{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 871.25, "completions/max_terminated_length": 820.5, "completions/mean_length": 635.265625, "completions/mean_terminated_length": 614.8063507080078, "completions/min_length": 324.5, "completions/min_terminated_length": 324.5, "epoch": 0.0003333333333333333, "grad_norm": 0.7860351204872131, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0304, "num_tokens": 50705.0, "reward": 0.22903646156191826, "reward_std": 0.18850377202033997, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.4166666716337204, "rewards/reasoning_steps_reward/std": 0.36324381828308105, "rewards/tag_count_reward/mean": 0.14453125, "rewards/tag_count_reward/std": 0.25423113256692886, "step": 1 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 921.0, "completions/max_terminated_length": 869.75, "completions/mean_length": 679.21875, "completions/mean_terminated_length": 612.8114471435547, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.0006666666666666666, "grad_norm": 0.6927920579910278, "kl": 0.0, "learning_rate": 6.666666666666668e-08, "loss": -0.112, "num_tokens": 105535.0, "reward": 0.28307291865348816, "reward_std": 0.20059899613261223, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.21039126068353653, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.4895833507180214, "rewards/reasoning_steps_reward/std": 0.34962671622633934, 
"rewards/tag_count_reward/mean": 0.1328125, "rewards/tag_count_reward/std": 0.28246864676475525, "step": 2 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 989.75, "completions/max_terminated_length": 917.75, "completions/mean_length": 609.421875, "completions/mean_terminated_length": 550.6205444335938, "completions/min_length": 274.5, "completions/min_terminated_length": 274.5, "epoch": 0.001, "grad_norm": 0.8291285037994385, "kl": 0.00030994415283203125, "learning_rate": 1.3333333333333336e-07, "loss": -0.0794, "num_tokens": 153722.0, "reward": 0.16028646053746343, "reward_std": 0.1685022683814168, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.29166666930541396, "rewards/reasoning_steps_reward/std": 0.32445300184190273, "rewards/tag_count_reward/mean": 0.08203125, "rewards/tag_count_reward/std": 0.2137749269604683, "step": 3 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 987.75, "completions/max_terminated_length": 894.25, "completions/mean_length": 632.671875, "completions/mean_terminated_length": 609.0352783203125, "completions/min_length": 299.25, "completions/min_terminated_length": 299.25, "epoch": 0.0013333333333333333, "grad_norm": 0.6764264702796936, "kl": 0.0003440380096435547, "learning_rate": 2.0000000000000002e-07, "loss": 0.0428, "num_tokens": 204677.0, "reward": 0.45119864493608475, "reward_std": 0.5436567962169647, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.17671946436166763, "rewards/penalized_accuracy_reward/std": 0.4693755432963371, "rewards/reasoning_steps_reward/mean": 0.4895833432674408, "rewards/reasoning_steps_reward/std": 0.364622987806797, "rewards/tag_count_reward/mean": 0.171875, "rewards/tag_count_reward/std": 
0.321233332157135, "step": 4 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 890.25, "completions/max_terminated_length": 841.75, "completions/mean_length": 495.625, "completions/mean_terminated_length": 484.78125762939453, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.0016666666666666668, "grad_norm": 0.9639339447021484, "kl": 0.0004448890686035156, "learning_rate": 2.666666666666667e-07, "loss": -0.1647, "num_tokens": 245197.0, "reward": 0.2981770969927311, "reward_std": 0.34278450906276703, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.07265625149011612, "rewards/penalized_accuracy_reward/std": 0.15937501192092896, "rewards/reasoning_steps_reward/mean": 0.4010416679084301, "rewards/reasoning_steps_reward/std": 0.41512854397296906, "rewards/tag_count_reward/mean": 0.1875, "rewards/tag_count_reward/std": 0.3145124241709709, "step": 5 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 988.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 731.109375, "completions/mean_terminated_length": 701.204345703125, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.002, "grad_norm": 0.7105857729911804, "kl": 0.00043010711669921875, "learning_rate": 3.3333333333333335e-07, "loss": -0.0603, "num_tokens": 305460.0, "reward": 0.17559084296226501, "reward_std": 0.3042265921831131, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.031580422073602676, "rewards/penalized_accuracy_reward/std": 0.1263216882944107, "rewards/reasoning_steps_reward/mean": 0.2708333432674408, "rewards/reasoning_steps_reward/std": 0.3607480823993683, "rewards/tag_count_reward/mean": 0.0234375, "rewards/tag_count_reward/std": 0.06404344737529755, "step": 6 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.1875, "completions/max_length": 1018.25, "completions/max_terminated_length": 893.5, "completions/mean_length": 684.03125, "completions/mean_terminated_length": 628.0986251831055, "completions/min_length": 351.25, "completions/min_terminated_length": 351.25, "epoch": 0.0023333333333333335, "grad_norm": 0.7914196848869324, "kl": 0.00037479400634765625, "learning_rate": 4.0000000000000003e-07, "loss": -0.0261, "num_tokens": 360710.0, "reward": 0.3031249940395355, "reward_std": 0.24116554856300354, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.21039126068353653, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.5312500074505806, "rewards/reasoning_steps_reward/std": 0.4420531764626503, "rewards/tag_count_reward/mean": 0.125, "rewards/tag_count_reward/std": 0.26729242503643036, "step": 7 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 874.75, "completions/max_terminated_length": 825.0, "completions/mean_length": 621.640625, "completions/mean_terminated_length": 583.5256652832031, "completions/min_length": 289.5, "completions/min_terminated_length": 289.5, "epoch": 0.0026666666666666666, "grad_norm": 0.6808961629867554, "kl": 0.000308990478515625, "learning_rate": 4.666666666666667e-07, "loss": 0.0044, "num_tokens": 408223.0, "reward": 0.16835937835276127, "reward_std": 0.1593070924282074, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.10077822208404541, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.2812500111758709, "rewards/reasoning_steps_reward/std": 0.2916427403688431, "rewards/tag_count_reward/mean": 0.08984375, "rewards/tag_count_reward/std": 0.21786238253116608, "step": 8 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 926.25, 
"completions/max_terminated_length": 918.0, "completions/mean_length": 631.046875, "completions/mean_terminated_length": 597.6302185058594, "completions/min_length": 348.5, "completions/min_terminated_length": 348.5, "epoch": 0.003, "grad_norm": 0.7980362772941589, "kl": 0.0003371238708496094, "learning_rate": 5.333333333333335e-07, "loss": -0.0431, "num_tokens": 457730.0, "reward": 0.37369417771697044, "reward_std": 0.43127137050032616, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.13189728558063507, "rewards/penalized_accuracy_reward/std": 0.33436600118875504, "rewards/reasoning_steps_reward/mean": 0.4375000074505806, "rewards/reasoning_steps_reward/std": 0.3658451661467552, "rewards/tag_count_reward/mean": 0.10546875, "rewards/tag_count_reward/std": 0.26681406423449516, "step": 9 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 953.25, "completions/max_terminated_length": 921.25, "completions/mean_length": 744.0625, "completions/mean_terminated_length": 680.9791870117188, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.0033333333333333335, "grad_norm": 0.5989229083061218, "kl": 0.0002636909484863281, "learning_rate": 6.000000000000001e-07, "loss": 0.0488, "num_tokens": 515910.0, "reward": 0.3035855982452631, "reward_std": 0.37139888666570187, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.1660856008529663, "rewards/penalized_accuracy_reward/std": 0.2546969950199127, "rewards/reasoning_steps_reward/mean": 0.2499999962747097, "rewards/reasoning_steps_reward/std": 0.22616704553365707, "rewards/tag_count_reward/mean": 0.0625, "rewards/tag_count_reward/std": 0.1956711709499359, "step": 10 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 893.25, "completions/max_terminated_length": 866.25, 
"completions/mean_length": 608.453125, "completions/mean_terminated_length": 562.5375061035156, "completions/min_length": 249.75, "completions/min_terminated_length": 249.75, "epoch": 0.0036666666666666666, "grad_norm": 0.8321331739425659, "kl": 0.0003871917724609375, "learning_rate": 6.666666666666667e-07, "loss": -0.0635, "num_tokens": 566691.0, "reward": 0.3195977807044983, "reward_std": 0.2748823333531618, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.17078252136707306, "rewards/penalized_accuracy_reward/mean": 0.027670694515109062, "rewards/penalized_accuracy_reward/std": 0.11068278551101685, "rewards/reasoning_steps_reward/mean": 0.4947916716337204, "rewards/reasoning_steps_reward/std": 0.3461822122335434, "rewards/tag_count_reward/mean": 0.1953125, "rewards/tag_count_reward/std": 0.3081725612282753, "step": 11 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 947.25, "completions/max_terminated_length": 913.5, "completions/mean_length": 700.21875, "completions/mean_terminated_length": 651.2534484863281, "completions/min_length": 370.75, "completions/min_terminated_length": 370.75, "epoch": 0.004, "grad_norm": 0.64812833070755, "kl": 0.0003380775451660156, "learning_rate": 7.333333333333334e-07, "loss": -0.0634, "num_tokens": 620673.0, "reward": 0.28919271379709244, "reward_std": 0.19906166940927505, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.557291679084301, "rewards/reasoning_steps_reward/std": 0.3937826156616211, "rewards/tag_count_reward/mean": 0.10546875, "rewards/tag_count_reward/std": 0.2427176907658577, "step": 12 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 932.5, "completions/max_terminated_length": 899.25, "completions/mean_length": 684.0625, "completions/mean_terminated_length": 
641.3697967529297, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.004333333333333333, "grad_norm": 0.7295641899108887, "kl": 0.0003490447998046875, "learning_rate": 8.000000000000001e-07, "loss": -0.0096, "num_tokens": 673253.0, "reward": 0.19283854216337204, "reward_std": 0.23215484991669655, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.14789126068353653, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.3177083432674408, "rewards/reasoning_steps_reward/std": 0.38450102508068085, "rewards/tag_count_reward/mean": 0.15234375, "rewards/tag_count_reward/std": 0.3064930960536003, "step": 13 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 988.0, "completions/max_terminated_length": 930.75, "completions/mean_length": 740.4375, "completions/mean_terminated_length": 700.2864837646484, "completions/min_length": 381.25, "completions/min_terminated_length": 381.25, "epoch": 0.004666666666666667, "grad_norm": 0.6185789108276367, "kl": 0.00036716461181640625, "learning_rate": 8.666666666666668e-07, "loss": 0.0149, "num_tokens": 735169.0, "reward": 0.22842025011777878, "reward_std": 0.3898373916745186, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.10159732773900032, "rewards/penalized_accuracy_reward/std": 0.3202204257249832, "rewards/reasoning_steps_reward/mean": 0.22395833767950535, "rewards/reasoning_steps_reward/std": 0.2862655222415924, "rewards/tag_count_reward/mean": 0.0859375, "rewards/tag_count_reward/std": 0.21858105063438416, "step": 14 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.75, "completions/mean_length": 719.765625, "completions/mean_terminated_length": 653.96728515625, "completions/min_length": 277.75, 
"completions/min_terminated_length": 277.75, "epoch": 0.005, "grad_norm": 0.672243595123291, "kl": 0.000499725341796875, "learning_rate": 9.333333333333334e-07, "loss": -0.0536, "num_tokens": 791810.0, "reward": 0.3214690247550607, "reward_std": 0.4147674571722746, "rewards/format_reward/mean": 0.078125, "rewards/format_reward/std": 0.18616948276758194, "rewards/penalized_accuracy_reward/mean": 0.14217214286327362, "rewards/penalized_accuracy_reward/std": 0.2546563148498535, "rewards/reasoning_steps_reward/mean": 0.250000006519258, "rewards/reasoning_steps_reward/std": 0.3242802955210209, "rewards/tag_count_reward/mean": 0.23046875, "rewards/tag_count_reward/std": 0.3277582451701164, "step": 15 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 735.25, "completions/max_terminated_length": 727.0, "completions/mean_length": 570.8125, "completions/mean_terminated_length": 559.7447967529297, "completions/min_length": 406.75, "completions/min_terminated_length": 406.75, "epoch": 0.005333333333333333, "grad_norm": 0.8322139382362366, "kl": 0.00043010711669921875, "learning_rate": 1.0000000000000002e-06, "loss": 0.0246, "num_tokens": 839238.0, "reward": 0.6507812291383743, "reward_std": 0.36999223567545414, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.21039126068353653, "rewards/penalized_accuracy_reward/mean": 0.30078125, "rewards/penalized_accuracy_reward/std": 0.20943717658519745, "rewards/reasoning_steps_reward/mean": 0.6250000149011612, "rewards/reasoning_steps_reward/std": 0.3061271086335182, "rewards/tag_count_reward/mean": 0.125, "rewards/tag_count_reward/std": 0.28826820850372314, "step": 16 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 894.75, "completions/max_terminated_length": 852.25, "completions/mean_length": 616.734375, "completions/mean_terminated_length": 578.4989624023438, "completions/min_length": 225.5, "completions/min_terminated_length": 225.5, 
"epoch": 0.005666666666666667, "grad_norm": 0.6916234493255615, "kl": 0.0005021095275878906, "learning_rate": 1.066666666666667e-06, "loss": -0.0533, "num_tokens": 887797.0, "reward": 0.20742188021540642, "reward_std": 0.19378306157886982, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.3437500037252903, "rewards/reasoning_steps_reward/std": 0.34409795701503754, "rewards/tag_count_reward/mean": 0.23046875, "rewards/tag_count_reward/std": 0.4111901819705963, "step": 17 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 861.25, "completions/max_terminated_length": 796.0, "completions/mean_length": 591.015625, "completions/mean_terminated_length": 568.8747711181641, "completions/min_length": 287.75, "completions/min_terminated_length": 287.75, "epoch": 0.006, "grad_norm": 0.9087463021278381, "kl": 0.0009756088256835938, "learning_rate": 1.1333333333333334e-06, "loss": 0.0337, "num_tokens": 934310.0, "reward": 0.19101562723517418, "reward_std": 0.2238428182899952, "rewards/format_reward/mean": 0.15625, "rewards/format_reward/std": 0.3604728877544403, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.1718750037252903, "rewards/reasoning_steps_reward/std": 0.3026356063783169, "rewards/tag_count_reward/mean": 0.42578125, "rewards/tag_count_reward/std": 0.4487849324941635, "step": 18 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 883.25, "completions/max_terminated_length": 788.25, "completions/mean_length": 555.65625, "completions/mean_terminated_length": 503.95638275146484, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.006333333333333333, "grad_norm": 0.945106029510498, "kl": 0.00121307373046875, "learning_rate": 
1.2000000000000002e-06, "loss": -0.0063, "num_tokens": 979584.0, "reward": 0.3121093846857548, "reward_std": 0.27271439135074615, "rewards/format_reward/mean": 0.234375, "rewards/format_reward/std": 0.4255262687802315, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.3593750074505806, "rewards/reasoning_steps_reward/std": 0.3613637499511242, "rewards/tag_count_reward/mean": 0.38671875, "rewards/tag_count_reward/std": 0.4463852792978287, "step": 19 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 600.328125, "completions/mean_terminated_length": 566.3238296508789, "completions/min_length": 263.75, "completions/min_terminated_length": 263.75, "epoch": 0.006666666666666667, "grad_norm": 0.7967208027839661, "kl": 0.0017490386962890625, "learning_rate": 1.2666666666666669e-06, "loss": 0.0029, "num_tokens": 1028293.0, "reward": 0.33867188170552254, "reward_std": 0.2778767757117748, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.4581565484404564, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.3125000074505806, "rewards/reasoning_steps_reward/std": 0.36415334790945053, "rewards/tag_count_reward/mean": 0.57421875, "rewards/tag_count_reward/std": 0.42155271768569946, "step": 20 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 898.75, "completions/max_terminated_length": 840.25, "completions/mean_length": 640.125, "completions/mean_terminated_length": 591.8221282958984, "completions/min_length": 295.5, "completions/min_terminated_length": 295.5, "epoch": 0.007, "grad_norm": 0.7993925213813782, "kl": 0.0019083023071289062, "learning_rate": 1.3333333333333334e-06, "loss": -0.0005, "num_tokens": 1078381.0, "reward": 
0.3596354275941849, "reward_std": 0.38519641384482384, "rewards/format_reward/mean": 0.171875, "rewards/format_reward/std": 0.37149807065725327, "rewards/penalized_accuracy_reward/mean": 0.08203125, "rewards/penalized_accuracy_reward/std": 0.17636188864707947, "rewards/reasoning_steps_reward/mean": 0.3333333469927311, "rewards/reasoning_steps_reward/std": 0.3621671050786972, "rewards/tag_count_reward/mean": 0.421875, "rewards/tag_count_reward/std": 0.422298327088356, "step": 21 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 900.75, "completions/max_terminated_length": 824.5, "completions/mean_length": 662.265625, "completions/mean_terminated_length": 595.5156402587891, "completions/min_length": 389.25, "completions/min_terminated_length": 389.25, "epoch": 0.007333333333333333, "grad_norm": 0.6876639127731323, "kl": 0.0032024383544921875, "learning_rate": 1.4000000000000001e-06, "loss": 0.0381, "num_tokens": 1132766.0, "reward": 0.7374160960316658, "reward_std": 0.6433865800499916, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.44495995342731476, "rewards/penalized_accuracy_reward/mean": 0.38207758590579033, "rewards/penalized_accuracy_reward/std": 0.5425488203763962, "rewards/reasoning_steps_reward/mean": 0.2552083395421505, "rewards/reasoning_steps_reward/std": 0.3171140179038048, "rewards/tag_count_reward/mean": 0.65234375, "rewards/tag_count_reward/std": 0.3613503724336624, "step": 22 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 813.75, "completions/max_terminated_length": 777.0, "completions/mean_length": 478.484375, "completions/mean_terminated_length": 471.4437561035156, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.007666666666666666, "grad_norm": 1.127966284751892, "kl": 0.005817413330078125, "learning_rate": 1.4666666666666669e-06, "loss": -0.0894, "num_tokens": 1173965.0, "reward": 0.611811488866806, 
"reward_std": 0.4041200578212738, "rewards/format_reward/mean": 0.546875, "rewards/format_reward/std": 0.48989029973745346, "rewards/penalized_accuracy_reward/mean": 0.07938960939645767, "rewards/penalized_accuracy_reward/std": 0.17101971805095673, "rewards/reasoning_steps_reward/mean": 0.4531250176951289, "rewards/reasoning_steps_reward/std": 0.33705293014645576, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.2502099722623825, "step": 23 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 988.25, "completions/max_terminated_length": 892.0, "completions/mean_length": 654.796875, "completions/mean_terminated_length": 599.6551284790039, "completions/min_length": 289.75, "completions/min_terminated_length": 289.75, "epoch": 0.008, "grad_norm": 0.8260610699653625, "kl": 0.004428863525390625, "learning_rate": 1.5333333333333334e-06, "loss": -0.0424, "num_tokens": 1228112.0, "reward": 0.48359375447034836, "reward_std": 0.25352058187127113, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.48989029973745346, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.4687500149011612, "rewards/reasoning_steps_reward/std": 0.36968404054641724, "rewards/tag_count_reward/mean": 0.8046875, "rewards/tag_count_reward/std": 0.30307503417134285, "step": 24 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 969.75, "completions/max_terminated_length": 879.75, "completions/mean_length": 636.75, "completions/mean_terminated_length": 613.5290374755859, "completions/min_length": 303.25, "completions/min_terminated_length": 303.25, "epoch": 0.008333333333333333, "grad_norm": 0.7867465019226074, "kl": 0.00386810302734375, "learning_rate": 1.6000000000000001e-06, "loss": -0.0077, "num_tokens": 1280608.0, "reward": 0.6752409785985947, "reward_std": 0.5305026173591614, 
"rewards/format_reward/mean": 0.703125, "rewards/format_reward/std": 0.4503342807292938, "rewards/penalized_accuracy_reward/mean": 0.1337045282125473, "rewards/penalized_accuracy_reward/std": 0.36535558104515076, "rewards/reasoning_steps_reward/mean": 0.3385416753590107, "rewards/reasoning_steps_reward/std": 0.3921737000346184, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.17918559536337852, "step": 25 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 956.25, "completions/max_terminated_length": 876.25, "completions/mean_length": 692.78125, "completions/mean_terminated_length": 641.17041015625, "completions/min_length": 356.75, "completions/min_terminated_length": 356.75, "epoch": 0.008666666666666666, "grad_norm": 1.0709264278411865, "kl": 0.020282745361328125, "learning_rate": 1.6666666666666667e-06, "loss": 0.0592, "num_tokens": 1335762.0, "reward": 0.49266771972179413, "reward_std": 0.3373037725687027, "rewards/format_reward/mean": 0.578125, "rewards/format_reward/std": 0.4612434431910515, "rewards/penalized_accuracy_reward/mean": 0.031599994748830795, "rewards/penalized_accuracy_reward/std": 0.12639997899532318, "rewards/reasoning_steps_reward/mean": 0.3072916828095913, "rewards/reasoning_steps_reward/std": 0.3412012457847595, "rewards/tag_count_reward/mean": 0.76171875, "rewards/tag_count_reward/std": 0.2921975739300251, "step": 26 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 862.5, "completions/max_terminated_length": 815.0, "completions/mean_length": 499.21875, "completions/mean_terminated_length": 490.92188262939453, "completions/min_length": 203.25, "completions/min_terminated_length": 203.25, "epoch": 0.009, "grad_norm": 0.7725469470024109, "kl": 0.00894927978515625, "learning_rate": 1.7333333333333336e-06, "loss": -0.0116, "num_tokens": 1378272.0, "reward": 0.6114357858896255, "reward_std": 0.35752149671316147, 
"rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.31687305867671967, "rewards/penalized_accuracy_reward/mean": 0.06482118368148804, "rewards/penalized_accuracy_reward/std": 0.17760424315929413, "rewards/reasoning_steps_reward/mean": 0.2447916753590107, "rewards/reasoning_steps_reward/std": 0.3417239859700203, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.16852997615933418, "step": 27 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 907.75, "completions/max_terminated_length": 890.75, "completions/mean_length": 659.234375, "completions/mean_terminated_length": 618.823974609375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.009333333333333334, "grad_norm": 0.6757703423500061, "kl": 0.00604248046875, "learning_rate": 1.8000000000000001e-06, "loss": -0.0214, "num_tokens": 1430415.0, "reward": 0.569692924618721, "reward_std": 0.35038041695952415, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.46566852182149887, "rewards/penalized_accuracy_reward/mean": 0.055760642513632774, "rewards/penalized_accuracy_reward/std": 0.2230425775051117, "rewards/reasoning_steps_reward/mean": 0.3177083395421505, "rewards/reasoning_steps_reward/std": 0.3475276306271553, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.2796638309955597, "step": 28 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 894.5, "completions/max_terminated_length": 778.5, "completions/mean_length": 568.203125, "completions/mean_terminated_length": 526.1045837402344, "completions/min_length": 190.75, "completions/min_terminated_length": 190.75, "epoch": 0.009666666666666667, "grad_norm": 0.9557077884674072, "kl": 0.00942230224609375, "learning_rate": 1.8666666666666669e-06, "loss": 0.0206, "num_tokens": 1478412.0, "reward": 0.6090030297636986, "reward_std": 0.34788935631513596, 
"rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4383598491549492, "rewards/penalized_accuracy_reward/mean": 0.0332217775285244, "rewards/penalized_accuracy_reward/std": 0.1328871250152588, "rewards/reasoning_steps_reward/mean": 0.4062500037252903, "rewards/reasoning_steps_reward/std": 0.37067270278930664, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.372597873210907, "step": 29 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 986.75, "completions/max_terminated_length": 968.25, "completions/mean_length": 744.6875, "completions/mean_terminated_length": 701.7882080078125, "completions/min_length": 314.5, "completions/min_terminated_length": 314.5, "epoch": 0.01, "grad_norm": 0.6671625971794128, "kl": 0.0051116943359375, "learning_rate": 1.9333333333333336e-06, "loss": -0.0188, "num_tokens": 1535592.0, "reward": 0.6108183711767197, "reward_std": 0.3935040086507797, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.3811737895011902, "rewards/penalized_accuracy_reward/mean": 0.07006317377090454, "rewards/penalized_accuracy_reward/std": 0.1914493590593338, "rewards/reasoning_steps_reward/mean": 0.3385416828095913, "rewards/reasoning_steps_reward/std": 0.382925845682621, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.23307598009705544, "step": 30 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.25, "completions/mean_length": 760.546875, "completions/mean_terminated_length": 627.3074493408203, "completions/min_length": 243.25, "completions/min_terminated_length": 243.25, "epoch": 0.010333333333333333, "grad_norm": 0.6423367857933044, "kl": 0.006683349609375, "learning_rate": 2.0000000000000003e-06, "loss": 0.0674, "num_tokens": 1597051.0, "reward": 0.5471354126930237, "reward_std": 0.2835099846124649, "rewards/format_reward/mean": 0.546875, 
"rewards/format_reward/std": 0.5049516260623932, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.5208333358168602, "rewards/reasoning_steps_reward/std": 0.39296089485287666, "rewards/tag_count_reward/mean": 0.6796875, "rewards/tag_count_reward/std": 0.3426893353462219, "step": 31 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1021.75, "completions/max_terminated_length": 937.0, "completions/mean_length": 564.15625, "completions/mean_terminated_length": 526.6151962280273, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.010666666666666666, "grad_norm": 0.8222790956497192, "kl": 0.0097503662109375, "learning_rate": 2.0666666666666666e-06, "loss": -0.002, "num_tokens": 1645301.0, "reward": 0.7227423191070557, "reward_std": 0.41062621772289276, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.42308124154806137, "rewards/penalized_accuracy_reward/mean": 0.1365443915128708, "rewards/penalized_accuracy_reward/std": 0.24466487765312195, "rewards/reasoning_steps_reward/mean": 0.4427083386108279, "rewards/reasoning_steps_reward/std": 0.31103401258587837, "rewards/tag_count_reward/mean": 0.7734375, "rewards/tag_count_reward/std": 0.3284476548433304, "step": 32 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 926.0, "completions/max_terminated_length": 908.75, "completions/mean_length": 638.296875, "completions/mean_terminated_length": 592.3899688720703, "completions/min_length": 232.25, "completions/min_terminated_length": 232.25, "epoch": 0.011, "grad_norm": 0.8139158487319946, "kl": 0.00734710693359375, "learning_rate": 2.133333333333334e-06, "loss": -0.075, "num_tokens": 1695816.0, "reward": 0.5666666626930237, "reward_std": 0.2244843989610672, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4160471484065056, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.3645833358168602, "rewards/reasoning_steps_reward/std": 0.38267721980810165, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.2710249461233616, "step": 33 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 960.5, "completions/max_terminated_length": 829.75, "completions/mean_length": 529.84375, "completions/mean_terminated_length": 506.60001373291016, "completions/min_length": 135.5, "completions/min_terminated_length": 135.5, "epoch": 0.011333333333333334, "grad_norm": 0.817894697189331, "kl": 0.0108642578125, "learning_rate": 2.2e-06, "loss": -0.0127, "num_tokens": 1739294.0, "reward": 0.6891430914402008, "reward_std": 0.38321299850940704, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.34944770485162735, "rewards/penalized_accuracy_reward/mean": 0.07091392576694489, "rewards/penalized_accuracy_reward/std": 0.19396641850471497, "rewards/reasoning_steps_reward/mean": 0.3958333507180214, "rewards/reasoning_steps_reward/std": 0.39766644686460495, "rewards/tag_count_reward/mean": 0.765625, "rewards/tag_count_reward/std": 0.3521217107772827, "step": 34 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 886.5, "completions/max_terminated_length": 739.0, "completions/mean_length": 598.5, "completions/mean_terminated_length": 495.6458511352539, "completions/min_length": 225.5, "completions/min_terminated_length": 225.5, "epoch": 0.011666666666666667, "grad_norm": 0.843439519405365, "kl": 0.00963592529296875, "learning_rate": 2.266666666666667e-06, "loss": 0.0383, "num_tokens": 1787694.0, "reward": 0.6172518730163574, "reward_std": 0.42104343324899673, "rewards/format_reward/mean": 0.671875, "rewards/format_reward/std": 0.3956565484404564, "rewards/penalized_accuracy_reward/mean": 0.0732414573431015, 
"rewards/penalized_accuracy_reward/std": 0.20035846531391144, "rewards/reasoning_steps_reward/mean": 0.3958333432674408, "rewards/reasoning_steps_reward/std": 0.3839438855648041, "rewards/tag_count_reward/mean": 0.7734375, "rewards/tag_count_reward/std": 0.3030990958213806, "step": 35 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 931.0, "completions/max_terminated_length": 894.25, "completions/mean_length": 662.046875, "completions/mean_terminated_length": 623.0243072509766, "completions/min_length": 262.75, "completions/min_terminated_length": 262.75, "epoch": 0.012, "grad_norm": 0.7322201728820801, "kl": 0.0060577392578125, "learning_rate": 2.3333333333333336e-06, "loss": 0.0241, "num_tokens": 1839521.0, "reward": 0.6868381351232529, "reward_std": 0.4258001074194908, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.3384781554341316, "rewards/penalized_accuracy_reward/mean": 0.1107964739203453, "rewards/penalized_accuracy_reward/std": 0.23831558227539062, "rewards/reasoning_steps_reward/mean": 0.3177083535119891, "rewards/reasoning_steps_reward/std": 0.32344260439276695, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.26763106137514114, "step": 36 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 733.25, "completions/max_terminated_length": 698.0, "completions/mean_length": 423.625, "completions/mean_terminated_length": 402.15625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.012333333333333333, "grad_norm": 0.9728919863700867, "kl": 0.0107421875, "learning_rate": 2.4000000000000003e-06, "loss": -0.0521, "num_tokens": 1876105.0, "reward": 0.6976470351219177, "reward_std": 0.30700354278087616, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3221946656703949, "rewards/penalized_accuracy_reward/mean": 0.05988661199808121, "rewards/penalized_accuracy_reward/std": 
0.1643446534872055, "rewards/reasoning_steps_reward/mean": 0.3958333507180214, "rewards/reasoning_steps_reward/std": 0.38543668389320374, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.1949814110994339, "step": 37 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 957.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 656.578125, "completions/mean_terminated_length": 629.3668365478516, "completions/min_length": 236.5, "completions/min_terminated_length": 236.5, "epoch": 0.012666666666666666, "grad_norm": 0.7981860637664795, "kl": 0.0076751708984375, "learning_rate": 2.466666666666667e-06, "loss": 0.0151, "num_tokens": 1928190.0, "reward": 0.5835937559604645, "reward_std": 0.21724450588226318, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.32438503205776215, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.3281250149011612, "rewards/reasoning_steps_reward/std": 0.400931254029274, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.23317711055278778, "step": 38 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 990.5, "completions/max_terminated_length": 901.0, "completions/mean_length": 615.765625, "completions/mean_terminated_length": 587.3415298461914, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.013, "grad_norm": 0.8061979413032532, "kl": 0.0093231201171875, "learning_rate": 2.5333333333333338e-06, "loss": -0.004, "num_tokens": 1978623.0, "reward": 0.8341648280620575, "reward_std": 0.5322102606296539, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.37585217505693436, "rewards/penalized_accuracy_reward/mean": 0.18755022436380386, "rewards/penalized_accuracy_reward/std": 0.404110312461853, "rewards/reasoning_steps_reward/mean": 
0.4635416567325592, "rewards/reasoning_steps_reward/std": 0.377722904086113, "rewards/tag_count_reward/mean": 0.8359375, "rewards/tag_count_reward/std": 0.28358178213238716, "step": 39 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 863.25, "completions/max_terminated_length": 793.0, "completions/mean_length": 629.4375, "completions/mean_terminated_length": 578.8204498291016, "completions/min_length": 294.75, "completions/min_terminated_length": 294.75, "epoch": 0.013333333333333334, "grad_norm": 0.7144984006881714, "kl": 0.0075531005859375, "learning_rate": 2.6e-06, "loss": 0.0337, "num_tokens": 2028779.0, "reward": 0.6130538880825043, "reward_std": 0.3260304667055607, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.2257782220840454, "rewards/penalized_accuracy_reward/mean": 0.03518928587436676, "rewards/penalized_accuracy_reward/std": 0.14075715839862823, "rewards/reasoning_steps_reward/mean": 0.2916666716337204, "rewards/reasoning_steps_reward/std": 0.3576437309384346, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.1747661679983139, "step": 40 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 711.75, "completions/max_terminated_length": 683.0, "completions/mean_length": 428.9375, "completions/mean_terminated_length": 417.7008972167969, "completions/min_length": 186.25, "completions/min_terminated_length": 186.25, "epoch": 0.013666666666666667, "grad_norm": 1.0025185346603394, "kl": 0.014068603515625, "learning_rate": 2.666666666666667e-06, "loss": -0.1125, "num_tokens": 2064119.0, "reward": 0.7020634412765503, "reward_std": 0.31718097999691963, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.18217839300632477, "rewards/penalized_accuracy_reward/mean": 0.034355103969573975, "rewards/penalized_accuracy_reward/std": 0.1374204158782959, "rewards/reasoning_steps_reward/mean": 0.4166666716337204, 
"rewards/reasoning_steps_reward/std": 0.3688225708901882, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.06649631634354591, "step": 41 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 765.71875, "completions/mean_terminated_length": 634.1123352050781, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.014, "grad_norm": 0.6406546831130981, "kl": 0.00603485107421875, "learning_rate": 2.7333333333333336e-06, "loss": 0.1538, "num_tokens": 2125669.0, "reward": 0.6540641784667969, "reward_std": 0.3556971549987793, "rewards/format_reward/mean": 0.703125, "rewards/format_reward/std": 0.45028156042099, "rewards/penalized_accuracy_reward/mean": 0.03492354974150658, "rewards/penalized_accuracy_reward/std": 0.1396941989660263, "rewards/reasoning_steps_reward/mean": 0.5312500149011612, "rewards/reasoning_steps_reward/std": 0.3982694745063782, "rewards/tag_count_reward/mean": 0.72265625, "rewards/tag_count_reward/std": 0.33274202048778534, "step": 42 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1011.75, "completions/max_terminated_length": 994.25, "completions/mean_length": 819.296875, "completions/mean_terminated_length": 770.6994323730469, "completions/min_length": 529.75, "completions/min_terminated_length": 529.75, "epoch": 0.014333333333333333, "grad_norm": 0.5630760192871094, "kl": 0.0052032470703125, "learning_rate": 2.8000000000000003e-06, "loss": 0.0635, "num_tokens": 2187736.0, "reward": 0.662239596247673, "reward_std": 0.25620727613568306, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4176512807607651, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.526041679084301, "rewards/reasoning_steps_reward/std": 0.3573034182190895, 
"rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.26535360887646675, "step": 43 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.25, "completions/mean_length": 883.703125, "completions/mean_terminated_length": 805.6934661865234, "completions/min_length": 515.5, "completions/min_terminated_length": 515.5, "epoch": 0.014666666666666666, "grad_norm": 0.48884493112564087, "kl": 0.005573272705078125, "learning_rate": 2.866666666666667e-06, "loss": 0.0273, "num_tokens": 2253701.0, "reward": 1.0193318128585815, "reward_std": 0.5557461529970169, "rewards/format_reward/mean": 0.59375, "rewards/format_reward/std": 0.4000816270709038, "rewards/penalized_accuracy_reward/mean": 0.37349849939346313, "rewards/penalized_accuracy_reward/std": 0.41204380989074707, "rewards/reasoning_steps_reward/mean": 0.6822916716337204, "rewards/reasoning_steps_reward/std": 0.3374997489154339, "rewards/tag_count_reward/mean": 0.671875, "rewards/tag_count_reward/std": 0.2664684094488621, "step": 44 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 976.0, "completions/max_terminated_length": 902.75, "completions/mean_length": 705.34375, "completions/mean_terminated_length": 608.9917602539062, "completions/min_length": 300.25, "completions/min_terminated_length": 300.25, "epoch": 0.015, "grad_norm": 0.6433221697807312, "kl": 0.00946044921875, "learning_rate": 2.9333333333333338e-06, "loss": 0.037, "num_tokens": 2315707.0, "reward": 0.7077403664588928, "reward_std": 0.40935203433036804, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.41104350984096527, "rewards/penalized_accuracy_reward/mean": 0.06776641309261322, "rewards/penalized_accuracy_reward/std": 0.1868770569562912, "rewards/reasoning_steps_reward/mean": 0.5260416865348816, "rewards/reasoning_steps_reward/std": 0.3749267980456352, "rewards/tag_count_reward/mean": 0.76953125, 
"rewards/tag_count_reward/std": 0.34914373606443405, "step": 45 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 932.75, "completions/max_terminated_length": 898.0, "completions/mean_length": 656.46875, "completions/mean_terminated_length": 630.1919555664062, "completions/min_length": 415.5, "completions/min_terminated_length": 415.5, "epoch": 0.015333333333333332, "grad_norm": 0.7993950247764587, "kl": 0.0079345703125, "learning_rate": 3e-06, "loss": 0.0068, "num_tokens": 2368185.0, "reward": 0.868732750415802, "reward_std": 0.4208342842757702, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.18217839300632477, "rewards/penalized_accuracy_reward/mean": 0.14789938926696777, "rewards/penalized_accuracy_reward/std": 0.26476314663887024, "rewards/reasoning_steps_reward/mean": 0.5260416939854622, "rewards/reasoning_steps_reward/std": 0.3555455729365349, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.11129852384328842, "step": 46 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 831.25, "completions/max_terminated_length": 774.75, "completions/mean_length": 608.578125, "completions/mean_terminated_length": 544.359375, "completions/min_length": 243.25, "completions/min_terminated_length": 243.25, "epoch": 0.015666666666666666, "grad_norm": 1.5673341751098633, "kl": 0.05785369873046875, "learning_rate": 3.066666666666667e-06, "loss": 0.0451, "num_tokens": 2415102.0, "reward": 0.7765065282583237, "reward_std": 0.3577282950282097, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.3125, "rewards/penalized_accuracy_reward/mean": 0.06830339878797531, "rewards/penalized_accuracy_reward/std": 0.18774420022964478, "rewards/reasoning_steps_reward/mean": 0.6093750223517418, "rewards/reasoning_steps_reward/std": 0.36846283823251724, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.2427973598241806, "step": 
47 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.25, "completions/mean_length": 789.46875, "completions/mean_terminated_length": 684.9875183105469, "completions/min_length": 350.75, "completions/min_terminated_length": 350.75, "epoch": 0.016, "grad_norm": 0.5259022116661072, "kl": 0.006622314453125, "learning_rate": 3.133333333333334e-06, "loss": 0.0595, "num_tokens": 2477964.0, "reward": 0.5730468779802322, "reward_std": 0.2987581789493561, "rewards/format_reward/mean": 0.5625, "rewards/format_reward/std": 0.4622559919953346, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.5625000074505806, "rewards/reasoning_steps_reward/std": 0.3903362527489662, "rewards/tag_count_reward/mean": 0.66796875, "rewards/tag_count_reward/std": 0.3739175945520401, "step": 48 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 915.75, "completions/max_terminated_length": 856.5, "completions/mean_length": 685.359375, "completions/mean_terminated_length": 627.7360687255859, "completions/min_length": 309.25, "completions/min_terminated_length": 309.25, "epoch": 0.01633333333333333, "grad_norm": 0.7897787690162659, "kl": 0.008880615234375, "learning_rate": 3.2000000000000003e-06, "loss": 0.032, "num_tokens": 2530819.0, "reward": 0.6785156428813934, "reward_std": 0.21095674112439156, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.2882782220840454, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.5000000111758709, "rewards/reasoning_steps_reward/std": 0.34909606724977493, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.16805679351091385, "step": 49 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1004.5, 
"completions/max_terminated_length": 952.5, "completions/mean_length": 680.25, "completions/mean_terminated_length": 654.5104370117188, "completions/min_length": 417.75, "completions/min_terminated_length": 417.75, "epoch": 0.016666666666666666, "grad_norm": 0.7008921504020691, "kl": 0.00914764404296875, "learning_rate": 3.266666666666667e-06, "loss": -0.0159, "num_tokens": 2584419.0, "reward": 0.7334635555744171, "reward_std": 0.20390507578849792, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3375816270709038, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.598958358168602, "rewards/reasoning_steps_reward/std": 0.33922333642840385, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.27025456726551056, "step": 50 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 961.75, "completions/max_terminated_length": 930.0, "completions/mean_length": 763.015625, "completions/mean_terminated_length": 721.3432312011719, "completions/min_length": 441.5, "completions/min_terminated_length": 441.5, "epoch": 0.017, "grad_norm": 0.686704158782959, "kl": 0.00669097900390625, "learning_rate": 3.3333333333333333e-06, "loss": 0.0438, "num_tokens": 2649380.0, "reward": 0.7923340648412704, "reward_std": 0.4311821572482586, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.41419370472431183, "rewards/penalized_accuracy_reward/mean": 0.07644861936569214, "rewards/penalized_accuracy_reward/std": 0.2088974118232727, "rewards/reasoning_steps_reward/mean": 0.6614583507180214, "rewards/reasoning_steps_reward/std": 0.3583720251917839, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.20158234424889088, "step": 51 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 943.75, "completions/max_terminated_length": 938.25, "completions/mean_length": 
719.15625, "completions/mean_terminated_length": 695.46875, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.017333333333333333, "grad_norm": 0.6101579070091248, "kl": 0.01004791259765625, "learning_rate": 3.4000000000000005e-06, "loss": -0.0173, "num_tokens": 2702990.0, "reward": 0.7369791567325592, "reward_std": 0.26093084178864956, "rewards/format_reward/mean": 0.671875, "rewards/format_reward/std": 0.36967839300632477, "rewards/penalized_accuracy_reward/mean": 0.01796874962747097, "rewards/penalized_accuracy_reward/std": 0.07187499850988388, "rewards/reasoning_steps_reward/mean": 0.7395833507180214, "rewards/reasoning_steps_reward/std": 0.29277897626161575, "rewards/tag_count_reward/mean": 0.8046875, "rewards/tag_count_reward/std": 0.27829742431640625, "step": 52 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 842.5, "completions/max_terminated_length": 837.25, "completions/mean_length": 655.0, "completions/mean_terminated_length": 639.6964569091797, "completions/min_length": 398.5, "completions/min_terminated_length": 398.5, "epoch": 0.017666666666666667, "grad_norm": 0.7164862751960754, "kl": 0.011871337890625, "learning_rate": 3.4666666666666672e-06, "loss": -0.0006, "num_tokens": 2755854.0, "reward": 0.6813784092664719, "reward_std": 0.3475731834769249, "rewards/format_reward/mean": 0.703125, "rewards/format_reward/std": 0.42046456038951874, "rewards/penalized_accuracy_reward/mean": 0.024607568979263306, "rewards/penalized_accuracy_reward/std": 0.09843027591705322, "rewards/reasoning_steps_reward/mean": 0.6041666865348816, "rewards/reasoning_steps_reward/std": 0.40745963156223297, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.34482838958501816, "step": 53 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 981.25, "completions/max_terminated_length": 956.25, "completions/mean_length": 735.3125, 
"completions/mean_terminated_length": 696.0399017333984, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.018, "grad_norm": 0.7103943824768066, "kl": 0.010498046875, "learning_rate": 3.5333333333333335e-06, "loss": -0.0591, "num_tokens": 2812194.0, "reward": 0.80135178565979, "reward_std": 0.3634731322526932, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.375, "rewards/penalized_accuracy_reward/mean": 0.05096115358173847, "rewards/penalized_accuracy_reward/std": 0.20384462922811508, "rewards/reasoning_steps_reward/mean": 0.7656250149011612, "rewards/reasoning_steps_reward/std": 0.3409550115466118, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.2768521849066019, "step": 54 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1000.25, "completions/max_terminated_length": 956.75, "completions/mean_length": 624.0, "completions/mean_terminated_length": 603.9885711669922, "completions/min_length": 288.25, "completions/min_terminated_length": 288.25, "epoch": 0.018333333333333333, "grad_norm": 0.840887725353241, "kl": 0.01261138916015625, "learning_rate": 3.6000000000000003e-06, "loss": -0.0931, "num_tokens": 2866658.0, "reward": 0.8203446567058563, "reward_std": 0.3611001707613468, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.34944770485162735, "rewards/penalized_accuracy_reward/mean": 0.07008424401283264, "rewards/penalized_accuracy_reward/std": 0.19155985116958618, "rewards/reasoning_steps_reward/mean": 0.630208358168602, "rewards/reasoning_steps_reward/std": 0.3516792505979538, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.22830459102988243, "step": 55 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.75, "completions/max_terminated_length": 640.75, "completions/mean_length": 446.15625, "completions/mean_terminated_length": 446.15625, 
"completions/min_length": 296.5, "completions/min_terminated_length": 296.5, "epoch": 0.018666666666666668, "grad_norm": 0.9962576031684875, "kl": 0.020416259765625, "learning_rate": 3.6666666666666666e-06, "loss": -0.0914, "num_tokens": 2906220.0, "reward": 0.7946614772081375, "reward_std": 0.21071942150592804, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.24866948276758194, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.6822916865348816, "rewards/reasoning_steps_reward/std": 0.3173614516854286, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2787376195192337, "step": 56 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 932.5, "completions/max_terminated_length": 837.75, "completions/mean_length": 594.28125, "completions/mean_terminated_length": 565.7655563354492, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.019, "grad_norm": 0.7202043533325195, "kl": 0.0177154541015625, "learning_rate": 3.7333333333333337e-06, "loss": -0.0621, "num_tokens": 2953486.0, "reward": 0.8713552355766296, "reward_std": 0.468075692653656, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.29930340498685837, "rewards/penalized_accuracy_reward/mean": 0.1528656743466854, "rewards/penalized_accuracy_reward/std": 0.3525933623313904, "rewards/reasoning_steps_reward/mean": 0.6510416865348816, "rewards/reasoning_steps_reward/std": 0.35786738246679306, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.24707800149917603, "step": 57 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 826.25, "completions/max_terminated_length": 800.5, "completions/mean_length": 588.96875, "completions/mean_terminated_length": 574.8820190429688, "completions/min_length": 285.5, "completions/min_terminated_length": 
285.5, "epoch": 0.019333333333333334, "grad_norm": 0.7140520215034485, "kl": 0.02069091796875, "learning_rate": 3.8000000000000005e-06, "loss": -0.0343, "num_tokens": 3001212.0, "reward": 0.9257438629865646, "reward_std": 0.4081820733845234, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.27156074345111847, "rewards/penalized_accuracy_reward/mean": 0.09058760292828083, "rewards/penalized_accuracy_reward/std": 0.2755988612771034, "rewards/reasoning_steps_reward/mean": 0.7656250298023224, "rewards/reasoning_steps_reward/std": 0.32280419021844864, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.10584449954330921, "step": 58 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 904.75, "completions/max_terminated_length": 853.5, "completions/mean_length": 612.609375, "completions/mean_terminated_length": 584.3977813720703, "completions/min_length": 297.5, "completions/min_terminated_length": 297.5, "epoch": 0.019666666666666666, "grad_norm": 0.6635720133781433, "kl": 0.0224456787109375, "learning_rate": 3.866666666666667e-06, "loss": 0.0093, "num_tokens": 3048851.0, "reward": 0.8723735809326172, "reward_std": 0.27260667085647583, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3265564441680908, "rewards/penalized_accuracy_reward/mean": 0.035785011947155, "rewards/penalized_accuracy_reward/std": 0.14314004778862, "rewards/reasoning_steps_reward/mean": 0.786458358168602, "rewards/reasoning_steps_reward/std": 0.2604687921702862, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.15990673378109932, "step": 59 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 850.25, "completions/max_terminated_length": 823.5, "completions/mean_length": 564.234375, "completions/mean_terminated_length": 533.5127105712891, "completions/min_length": 266.5, "completions/min_terminated_length": 266.5, "epoch": 0.02, 
"grad_norm": 0.6934652328491211, "kl": 0.030548095703125, "learning_rate": 3.9333333333333335e-06, "loss": -0.0195, "num_tokens": 3093298.0, "reward": 1.116418480873108, "reward_std": 0.5154721215367317, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.36435678601264954, "rewards/penalized_accuracy_reward/mean": 0.2634236477315426, "rewards/penalized_accuracy_reward/std": 0.39374517649412155, "rewards/reasoning_steps_reward/mean": 0.8802083283662796, "rewards/reasoning_steps_reward/std": 0.25018948689103127, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.22917302697896957, "step": 60 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 814.25, "completions/max_terminated_length": 791.25, "completions/mean_length": 554.90625, "completions/mean_terminated_length": 549.4541778564453, "completions/min_length": 225.5, "completions/min_terminated_length": 225.5, "epoch": 0.02033333333333333, "grad_norm": 0.8435772061347961, "kl": 0.02685546875, "learning_rate": 4.000000000000001e-06, "loss": -0.0585, "num_tokens": 3138060.0, "reward": 0.9362861067056656, "reward_std": 0.2588004246354103, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.1875, "rewards/penalized_accuracy_reward/mean": 0.029124625027179718, "rewards/penalized_accuracy_reward/std": 0.11649850755929947, "rewards/reasoning_steps_reward/mean": 0.8541667014360428, "rewards/reasoning_steps_reward/std": 0.26949557289481163, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 61 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 921.75, "completions/max_terminated_length": 904.75, "completions/mean_length": 613.109375, "completions/mean_terminated_length": 601.5496215820312, "completions/min_length": 356.25, "completions/min_terminated_length": 356.25, "epoch": 0.020666666666666667, "grad_norm": 0.7626720070838928, "kl": 
0.025482177734375, "learning_rate": 4.066666666666667e-06, "loss": -0.0778, "num_tokens": 3188611.0, "reward": 0.9843322783708572, "reward_std": 0.3281868249177933, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.18616948276758194, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.9010416865348816, "rewards/reasoning_steps_reward/std": 0.2222483716905117, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.12410355359315872, "step": 62 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 897.25, "completions/max_terminated_length": 882.0, "completions/mean_length": 638.796875, "completions/mean_terminated_length": 612.8735809326172, "completions/min_length": 285.75, "completions/min_terminated_length": 285.75, "epoch": 0.021, "grad_norm": 0.7839279770851135, "kl": 0.038116455078125, "learning_rate": 4.133333333333333e-06, "loss": -0.0376, "num_tokens": 3237910.0, "reward": 0.8942708224058151, "reward_std": 0.1893441639840603, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.24467839300632477, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.8854166865348816, "rewards/reasoning_steps_reward/std": 0.24039705470204353, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.09034235030412674, "step": 63 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 797.25, "completions/max_terminated_length": 769.75, "completions/mean_length": 527.171875, "completions/mean_terminated_length": 521.1739654541016, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.021333333333333333, "grad_norm": 0.7821366786956787, "kl": 0.030670166015625, "learning_rate": 4.2000000000000004e-06, "loss": -0.0015, "num_tokens": 
3280753.0, "reward": 1.0909536629915237, "reward_std": 0.4102521315217018, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.24866948276758194, "rewards/penalized_accuracy_reward/mean": 0.1860057171434164, "rewards/penalized_accuracy_reward/std": 0.32973112910985947, "rewards/reasoning_steps_reward/mean": 0.895833358168602, "rewards/reasoning_steps_reward/std": 0.1899307519197464, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15195956081151962, "step": 64 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1001.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 681.328125, "completions/mean_terminated_length": 620.4050750732422, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.021666666666666667, "grad_norm": 0.7356759309768677, "kl": 0.02301025390625, "learning_rate": 4.266666666666668e-06, "loss": 0.1044, "num_tokens": 3334966.0, "reward": 0.8970052152872086, "reward_std": 0.18142974004149437, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.3683478757739067, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9479167014360428, "rewards/reasoning_steps_reward/std": 0.1763468012213707, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.1946777980774641, "step": 65 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 958.25, "completions/max_terminated_length": 885.5, "completions/mean_length": 617.75, "completions/mean_terminated_length": 591.3373718261719, "completions/min_length": 338.5, "completions/min_terminated_length": 338.5, "epoch": 0.022, "grad_norm": 0.7183022499084473, "kl": 0.02606201171875, "learning_rate": 4.333333333333334e-06, "loss": 0.1036, "num_tokens": 3383014.0, "reward": 1.533137023448944, "reward_std": 0.23449738323688507, 
"rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.2750816270709038, "rewards/penalized_accuracy_reward/mean": 0.6005849502980709, "rewards/penalized_accuracy_reward/std": 0.13191417790949345, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.05442607030272484, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.17777499184012413, "step": 66 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 972.5, "completions/max_terminated_length": 890.5, "completions/mean_length": 744.75, "completions/mean_terminated_length": 720.5144195556641, "completions/min_length": 490.25, "completions/min_terminated_length": 490.25, "epoch": 0.022333333333333334, "grad_norm": 0.6722339987754822, "kl": 0.02935791015625, "learning_rate": 4.4e-06, "loss": 0.0201, "num_tokens": 3442262.0, "reward": 0.9450913518667221, "reward_std": 0.21717733424156904, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.24467839300632477, "rewards/penalized_accuracy_reward/mean": 0.02477882243692875, "rewards/penalized_accuracy_reward/std": 0.099115289747715, "rewards/reasoning_steps_reward/mean": 0.9375000149011612, "rewards/reasoning_steps_reward/std": 0.1712810881435871, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.11574538052082062, "step": 67 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 946.25, "completions/max_terminated_length": 927.0, "completions/mean_length": 760.0, "completions/mean_terminated_length": 719.6666870117188, "completions/min_length": 486.75, "completions/min_terminated_length": 486.75, "epoch": 0.02266666666666667, "grad_norm": 0.497885525226593, "kl": 0.029876708984375, "learning_rate": 4.4666666666666665e-06, "loss": 0.0258, "num_tokens": 3499686.0, "reward": 0.8750000149011612, "reward_std": 0.14313609153032303, "rewards/format_reward/mean": 0.75, 
"rewards/format_reward/std": 0.30717839300632477, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9687500149011612, "rewards/reasoning_steps_reward/std": 0.10621638596057892, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.13139523938298225, "step": 68 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 968.5, "completions/max_terminated_length": 953.0, "completions/mean_length": 754.375, "completions/mean_terminated_length": 725.8952026367188, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 0.023, "grad_norm": 0.6058640480041504, "kl": 0.026092529296875, "learning_rate": 4.533333333333334e-06, "loss": 0.0663, "num_tokens": 3561918.0, "reward": 0.933966264128685, "reward_std": 0.4210771322250366, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.3265564441680908, "rewards/penalized_accuracy_reward/mean": 0.07641417533159256, "rewards/penalized_accuracy_reward/std": 0.30565670132637024, "rewards/reasoning_steps_reward/mean": 0.8854166865348816, "rewards/reasoning_steps_reward/std": 0.22727786377072334, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.19980989769101143, "step": 69 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 892.25, "completions/max_terminated_length": 783.5, "completions/mean_length": 589.015625, "completions/mean_terminated_length": 570.8385620117188, "completions/min_length": 378.5, "completions/min_terminated_length": 378.5, "epoch": 0.023333333333333334, "grad_norm": 0.7925571203231812, "kl": 0.038665771484375, "learning_rate": 4.600000000000001e-06, "loss": 0.0554, "num_tokens": 3609519.0, "reward": 0.9950062930583954, "reward_std": 0.18352606147527695, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.14789126068353653, 
"rewards/penalized_accuracy_reward/mean": 0.02547505497932434, "rewards/penalized_accuracy_reward/std": 0.10190021991729736, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.0624999962747097, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1292813941836357, "step": 70 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1017.5, "completions/max_terminated_length": 995.5, "completions/mean_length": 649.046875, "completions/mean_terminated_length": 618.1365280151367, "completions/min_length": 380.5, "completions/min_terminated_length": 380.5, "epoch": 0.023666666666666666, "grad_norm": 0.7724472284317017, "kl": 0.03399658203125, "learning_rate": 4.666666666666667e-06, "loss": 0.108, "num_tokens": 3661650.0, "reward": 0.9970489591360092, "reward_std": 0.30074702948331833, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3450859263539314, "rewards/penalized_accuracy_reward/mean": 0.06775209307670593, "rewards/penalized_accuracy_reward/std": 0.18513934314250946, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.0624999962747097, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1695580966770649, "step": 71 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 933.75, "completions/max_terminated_length": 917.0, "completions/mean_length": 641.21875, "completions/mean_terminated_length": 631.3729400634766, "completions/min_length": 379.75, "completions/min_terminated_length": 379.75, "epoch": 0.024, "grad_norm": 0.5506178736686707, "kl": 0.03662109375, "learning_rate": 4.7333333333333335e-06, "loss": -0.0149, "num_tokens": 3713456.0, "reward": 1.0036645531654358, "reward_std": 0.20802644453942776, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.14789126068353653, "rewards/penalized_accuracy_reward/mean": 
0.03712809830904007, "rewards/penalized_accuracy_reward/std": 0.14851240813732147, "rewards/reasoning_steps_reward/mean": 0.9791666865348816, "rewards/reasoning_steps_reward/std": 0.07013041526079178, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11938536167144775, "step": 72 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.5, "completions/max_terminated_length": 698.5, "completions/mean_length": 423.875, "completions/mean_terminated_length": 423.875, "completions/min_length": 269.25, "completions/min_terminated_length": 269.25, "epoch": 0.024333333333333332, "grad_norm": 0.859659731388092, "kl": 0.04425048828125, "learning_rate": 4.800000000000001e-06, "loss": 0.0026, "num_tokens": 3748776.0, "reward": 1.2820918262004852, "reward_std": 0.6469196081161499, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.3676386624574661, "rewards/penalized_accuracy_reward/std": 0.6582163870334625, "rewards/reasoning_steps_reward/mean": 0.859375, "rewards/reasoning_steps_reward/std": 0.2520834319293499, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.07966844737529755, "step": 73 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.5, "completions/max_terminated_length": 763.5, "completions/mean_length": 542.515625, "completions/mean_terminated_length": 542.515625, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.024666666666666667, "grad_norm": 0.5363684892654419, "kl": 0.0386962890625, "learning_rate": 4.866666666666667e-06, "loss": -0.0286, "num_tokens": 3794937.0, "reward": 1.0115860998630524, "reward_std": 0.21859073173254728, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0546850711107254, "rewards/penalized_accuracy_reward/std": 0.15053680539131165, 
"rewards/reasoning_steps_reward/mean": 0.9270833432674408, "rewards/reasoning_steps_reward/std": 0.1448565348982811, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 74 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 778.5, "completions/max_terminated_length": 766.5, "completions/mean_length": 561.4375, "completions/mean_terminated_length": 557.7208404541016, "completions/min_length": 328.75, "completions/min_terminated_length": 328.75, "epoch": 0.025, "grad_norm": 0.8160569667816162, "kl": 0.041351318359375, "learning_rate": 4.933333333333334e-06, "loss": 0.021, "num_tokens": 3839589.0, "reward": 1.1652396470308304, "reward_std": 0.28621215745806694, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.25, "rewards/penalized_accuracy_reward/mean": 0.27513551712036133, "rewards/penalized_accuracy_reward/std": 0.16930751502513885, "rewards/reasoning_steps_reward/mean": 0.833333358168602, "rewards/reasoning_steps_reward/std": 0.19271302968263626, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 75 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 449.75, "completions/mean_terminated_length": 449.75, "completions/min_length": 257.5, "completions/min_terminated_length": 257.5, "epoch": 0.025333333333333333, "grad_norm": 0.8491156697273254, "kl": 0.05242919921875, "learning_rate": 5e-06, "loss": -0.0402, "num_tokens": 3879173.0, "reward": 0.9087462574243546, "reward_std": 0.22535051591694355, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.2050696536898613, "rewards/penalized_accuracy_reward/mean": 0.024371251463890076, "rewards/penalized_accuracy_reward/std": 0.0974850058555603, "rewards/reasoning_steps_reward/mean": 0.8750000149011612, "rewards/reasoning_steps_reward/std": 0.18622694537043571, 
"rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.21993406862020493, "step": 76 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 466.328125, "completions/mean_terminated_length": 466.328125, "completions/min_length": 281.25, "completions/min_terminated_length": 281.25, "epoch": 0.025666666666666667, "grad_norm": 0.913472056388855, "kl": 0.046905517578125, "learning_rate": 5.0666666666666676e-06, "loss": 0.0071, "num_tokens": 3920298.0, "reward": 0.9114020764827728, "reward_std": 0.2558419294655323, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3058478757739067, "rewards/penalized_accuracy_reward/mean": 0.028198951855301857, "rewards/penalized_accuracy_reward/std": 0.11279580742120743, "rewards/reasoning_steps_reward/mean": 0.9062500149011612, "rewards/reasoning_steps_reward/std": 0.17976614087820053, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1920349784195423, "step": 77 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 699.75, "completions/max_terminated_length": 552.0, "completions/mean_length": 378.46875, "completions/mean_terminated_length": 350.8145980834961, "completions/min_length": 186.75, "completions/min_terminated_length": 186.75, "epoch": 0.026, "grad_norm": 1.1382102966308594, "kl": 0.0594482421875, "learning_rate": 5.133333333333334e-06, "loss": 0.1165, "num_tokens": 3953512.0, "reward": 0.8847656399011612, "reward_std": 0.16677778400480747, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29578252136707306, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.8593750298023224, "rewards/reasoning_steps_reward/std": 0.19023765996098518, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 
0.20477662421762943, "step": 78 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.75, "completions/max_terminated_length": 744.75, "completions/mean_length": 499.765625, "completions/mean_terminated_length": 499.765625, "completions/min_length": 295.75, "completions/min_terminated_length": 295.75, "epoch": 0.026333333333333334, "grad_norm": 0.8621724843978882, "kl": 0.0494384765625, "learning_rate": 5.2e-06, "loss": 0.0036, "num_tokens": 3996825.0, "reward": 0.9801009744405746, "reward_std": 0.2303646355867386, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.1875, "rewards/penalized_accuracy_reward/mean": 0.033877044916152954, "rewards/penalized_accuracy_reward/std": 0.135508194565773, "rewards/reasoning_steps_reward/mean": 0.9322916865348816, "rewards/reasoning_steps_reward/std": 0.1747187376022339, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 79 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 712.5, "completions/max_terminated_length": 661.5, "completions/mean_length": 467.015625, "completions/mean_terminated_length": 460.1291809082031, "completions/min_length": 278.75, "completions/min_terminated_length": 278.75, "epoch": 0.02666666666666667, "grad_norm": 0.8190663456916809, "kl": 0.06005859375, "learning_rate": 5.2666666666666665e-06, "loss": -0.0029, "num_tokens": 4036218.0, "reward": 1.0388395190238953, "reward_std": 0.3393943142145872, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.08539126068353653, "rewards/penalized_accuracy_reward/mean": 0.093787407502532, "rewards/penalized_accuracy_reward/std": 0.27905965596437454, "rewards/reasoning_steps_reward/mean": 0.9166666716337204, "rewards/reasoning_steps_reward/std": 0.1558472253382206, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.021347815170884132, "step": 80 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.015625, "completions/max_length": 749.25, "completions/max_terminated_length": 721.25, "completions/mean_length": 470.453125, "completions/mean_terminated_length": 463.5166778564453, "completions/min_length": 237.75, "completions/min_terminated_length": 237.75, "epoch": 0.027, "grad_norm": 0.7197569012641907, "kl": 0.0582275390625, "learning_rate": 5.333333333333334e-06, "loss": 0.0067, "num_tokens": 4080423.0, "reward": 1.0867950171232224, "reward_std": 0.3874451629817486, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.14789126068353653, "rewards/penalized_accuracy_reward/mean": 0.12611790746450424, "rewards/penalized_accuracy_reward/std": 0.3361714631319046, "rewards/reasoning_steps_reward/mean": 0.9635416865348816, "rewards/reasoning_steps_reward/std": 0.10776668787002563, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.08384781517088413, "step": 81 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 781.25, "completions/max_terminated_length": 774.25, "completions/mean_length": 486.921875, "completions/mean_terminated_length": 479.765625, "completions/min_length": 237.25, "completions/min_terminated_length": 237.25, "epoch": 0.027333333333333334, "grad_norm": 0.7567655444145203, "kl": 0.062744140625, "learning_rate": 5.400000000000001e-06, "loss": -0.0118, "num_tokens": 4121794.0, "reward": 1.0785975009202957, "reward_std": 0.4485790819162503, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.14789126068353653, "rewards/penalized_accuracy_reward/mean": 0.15333710610866547, "rewards/penalized_accuracy_reward/std": 0.34840136766433716, "rewards/reasoning_steps_reward/mean": 0.8958333432674408, "rewards/reasoning_steps_reward/std": 0.17417392134666443, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.10694126039743423, "step": 82 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, 
"completions/max_length": 732.25, "completions/max_terminated_length": 732.25, "completions/mean_length": 482.203125, "completions/mean_terminated_length": 482.203125, "completions/min_length": 227.5, "completions/min_terminated_length": 227.5, "epoch": 0.027666666666666666, "grad_norm": 0.7868958115577698, "kl": 0.06304931640625, "learning_rate": 5.466666666666667e-06, "loss": 0.0333, "num_tokens": 4163967.0, "reward": 1.3195671439170837, "reward_std": 0.5870583718642592, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.3366243988275528, "rewards/penalized_accuracy_reward/std": 0.538981094956398, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.078125, "step": 83 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 846.0, "completions/max_terminated_length": 820.75, "completions/mean_length": 528.234375, "completions/mean_terminated_length": 521.2843780517578, "completions/min_length": 270.25, "completions/min_terminated_length": 270.25, "epoch": 0.028, "grad_norm": 0.6654375195503235, "kl": 0.05926513671875, "learning_rate": 5.533333333333334e-06, "loss": -0.0567, "num_tokens": 4208846.0, "reward": 1.0562476068735123, "reward_std": 0.35241691023111343, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.09452884644269943, "rewards/penalized_accuracy_reward/std": 0.2987591028213501, "rewards/reasoning_steps_reward/mean": 0.9375000149011612, "rewards/reasoning_steps_reward/std": 0.17659492790699005, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 84 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 892.5, "completions/max_terminated_length": 805.0, 
"completions/mean_length": 576.078125, "completions/mean_terminated_length": 568.4114685058594, "completions/min_length": 329.5, "completions/min_terminated_length": 329.5, "epoch": 0.028333333333333332, "grad_norm": 0.8431299328804016, "kl": 0.0531005859375, "learning_rate": 5.600000000000001e-06, "loss": 0.0185, "num_tokens": 4259827.0, "reward": 0.9817708432674408, "reward_std": 0.06735242495778948, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11091229319572449, "step": 85 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 640.75, "completions/max_terminated_length": 595.5, "completions/mean_length": 413.34375, "completions/mean_terminated_length": 407.17189025878906, "completions/min_length": 213.25, "completions/min_terminated_length": 213.25, "epoch": 0.028666666666666667, "grad_norm": 0.8590614795684814, "kl": 0.07122802734375, "learning_rate": 5.666666666666667e-06, "loss": 0.0421, "num_tokens": 4296057.0, "reward": 1.1069028824567795, "reward_std": 0.3679976146668196, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.21039126068353653, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.911458358168602, "rewards/reasoning_steps_reward/std": 0.2015480175614357, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06524410098791122, "step": 86 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 989.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 705.484375, "completions/mean_terminated_length": 672.3079986572266, 
"completions/min_length": 404.5, "completions/min_terminated_length": 404.5, "epoch": 0.029, "grad_norm": 0.5832537412643433, "kl": 0.047698974609375, "learning_rate": 5.733333333333334e-06, "loss": 0.0553, "num_tokens": 4352808.0, "reward": 0.9296875, "reward_std": 0.10884983465075493, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.21347815543413162, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.04929708316922188, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.12680982053279877, "step": 87 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 911.5, "completions/max_terminated_length": 840.25, "completions/mean_length": 578.09375, "completions/mean_terminated_length": 570.7739715576172, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.029333333333333333, "grad_norm": 0.8061636686325073, "kl": 0.06048583984375, "learning_rate": 5.8e-06, "loss": -0.0503, "num_tokens": 4400094.0, "reward": 1.097521647810936, "reward_std": 0.2972461935132742, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.1558550000190735, "rewards/penalized_accuracy_reward/std": 0.18377679586410522, "rewards/reasoning_steps_reward/mean": 0.911458358168602, "rewards/reasoning_steps_reward/std": 0.20518534630537033, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 88 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 875.25, "completions/max_terminated_length": 813.0, "completions/mean_length": 573.25, "completions/mean_terminated_length": 557.3623657226562, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.029666666666666668, "grad_norm": 
0.8729704022407532, "kl": 0.05291748046875, "learning_rate": 5.8666666666666675e-06, "loss": -0.0207, "num_tokens": 4448718.0, "reward": 0.9381932765245438, "reward_std": 0.2505082078278065, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.31116948276758194, "rewards/penalized_accuracy_reward/mean": 0.029729731380939484, "rewards/penalized_accuracy_reward/std": 0.11891893297433853, "rewards/reasoning_steps_reward/mean": 0.9114583432674408, "rewards/reasoning_steps_reward/std": 0.14012115448713303, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10485684871673584, "step": 89 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 947.5, "completions/max_terminated_length": 923.25, "completions/mean_length": 628.734375, "completions/mean_terminated_length": 622.8916778564453, "completions/min_length": 381.75, "completions/min_terminated_length": 381.75, "epoch": 0.03, "grad_norm": 0.705189049243927, "kl": 0.0528564453125, "learning_rate": 5.933333333333335e-06, "loss": 0.0152, "num_tokens": 4500989.0, "reward": 0.9658854156732559, "reward_std": 0.10832381062209606, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.1875, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.973958358168602, "rewards/reasoning_steps_reward/std": 0.09096374735236168, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09375, "step": 90 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1011.75, "completions/max_terminated_length": 1001.0, "completions/mean_length": 716.953125, "completions/mean_terminated_length": 704.4674224853516, "completions/min_length": 464.75, "completions/min_terminated_length": 464.75, "epoch": 0.030333333333333334, "grad_norm": 0.6014336347579956, "kl": 0.04913330078125, "learning_rate": 6e-06, "loss": 0.0312, 
"num_tokens": 4559210.0, "reward": 1.0703469514846802, "reward_std": 0.3709853794425726, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.2640564441680908, "rewards/penalized_accuracy_reward/mean": 0.13141468167304993, "rewards/penalized_accuracy_reward/std": 0.2352112978696823, "rewards/reasoning_steps_reward/mean": 0.973958358168602, "rewards/reasoning_steps_reward/std": 0.1041666604578495, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.10916591435670853, "step": 91 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 875.75, "completions/max_terminated_length": 857.75, "completions/mean_length": 622.109375, "completions/mean_terminated_length": 608.2596282958984, "completions/min_length": 389.25, "completions/min_terminated_length": 389.25, "epoch": 0.030666666666666665, "grad_norm": 0.699589729309082, "kl": 0.0509033203125, "learning_rate": 6.066666666666667e-06, "loss": 0.0111, "num_tokens": 4606721.0, "reward": 1.1923158913850784, "reward_std": 0.35641849786043167, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.23680340498685837, "rewards/penalized_accuracy_reward/mean": 0.24192526936531067, "rewards/penalized_accuracy_reward/std": 0.28356412053108215, "rewards/reasoning_steps_reward/mean": 0.984375, "rewards/reasoning_steps_reward/std": 0.03359273821115494, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14332501962780952, "step": 92 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 940.5, "completions/max_terminated_length": 908.25, "completions/mean_length": 607.8125, "completions/mean_terminated_length": 602.2208404541016, "completions/min_length": 317.75, "completions/min_terminated_length": 317.75, "epoch": 0.031, "grad_norm": 1.3742492198944092, "kl": 0.0843505859375, "learning_rate": 6.133333333333334e-06, "loss": -0.0486, "num_tokens": 4656085.0, "reward": 
0.9291666746139526, "reward_std": 0.14688335917890072, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.23328252136707306, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.927083358168602, "rewards/reasoning_steps_reward/std": 0.14779141545295715, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.10037772543728352, "step": 93 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 874.75, "completions/max_terminated_length": 818.25, "completions/mean_length": 534.53125, "completions/mean_terminated_length": 527.3885498046875, "completions/min_length": 350.25, "completions/min_terminated_length": 350.25, "epoch": 0.03133333333333333, "grad_norm": 0.8657882809638977, "kl": 0.06103515625, "learning_rate": 6.200000000000001e-06, "loss": 0.0349, "num_tokens": 4701863.0, "reward": 0.9688801914453506, "reward_std": 0.10679070092737675, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.1875, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.06615880131721497, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.078125, "step": 94 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 866.25, "completions/max_terminated_length": 838.75, "completions/mean_length": 599.34375, "completions/mean_terminated_length": 593.5364685058594, "completions/min_length": 338.75, "completions/min_terminated_length": 338.75, "epoch": 0.03166666666666667, "grad_norm": 0.7609896063804626, "kl": 0.06109619140625, "learning_rate": 6.266666666666668e-06, "loss": -0.011, "num_tokens": 4748829.0, "reward": 0.9533854424953461, "reward_std": 0.11424364056438208, "rewards/format_reward/mean": 0.921875, 
"rewards/format_reward/std": 0.2257782220840454, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.08699213340878487, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07206955552101135, "step": 95 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.75, "completions/max_terminated_length": 840.75, "completions/mean_length": 591.15625, "completions/mean_terminated_length": 591.15625, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.032, "grad_norm": 0.7026406526565552, "kl": 0.05560302734375, "learning_rate": 6.333333333333333e-06, "loss": -0.0253, "num_tokens": 4796583.0, "reward": 1.0508922636508942, "reward_std": 0.2978524469071999, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.14789126068353653, "rewards/penalized_accuracy_reward/mean": 0.1111786812543869, "rewards/penalized_accuracy_reward/std": 0.19907879829406738, "rewards/reasoning_steps_reward/mean": 0.9270833432674408, "rewards/reasoning_steps_reward/std": 0.12794098258018494, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.12456496804952621, "step": 96 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.25, "completions/max_terminated_length": 809.25, "completions/mean_length": 510.171875, "completions/mean_terminated_length": 510.171875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.03233333333333333, "grad_norm": 0.8602376580238342, "kl": 0.0557861328125, "learning_rate": 6.4000000000000006e-06, "loss": -0.0679, "num_tokens": 4844402.0, "reward": 1.120245411992073, "reward_std": 0.32840642519295216, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 
0.17519332468509674, "rewards/penalized_accuracy_reward/std": 0.23455476760864258, "rewards/reasoning_steps_reward/mean": 0.9166666865348816, "rewards/reasoning_steps_reward/std": 0.19030534476041794, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 97 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 565.53125, "completions/mean_terminated_length": 565.53125, "completions/min_length": 272.5, "completions/min_terminated_length": 272.5, "epoch": 0.03266666666666666, "grad_norm": 0.4225853979587555, "kl": 0.05230712890625, "learning_rate": 6.466666666666667e-06, "loss": -0.0753, "num_tokens": 4891300.0, "reward": 0.9843750149011612, "reward_std": 0.04488958604633808, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9687500149011612, "rewards/reasoning_steps_reward/std": 0.08977919071912766, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 98 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.25, "completions/max_terminated_length": 699.25, "completions/mean_length": 452.25, "completions/mean_terminated_length": 452.25, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.033, "grad_norm": 0.8484078049659729, "kl": 0.05780029296875, "learning_rate": 6.533333333333334e-06, "loss": -0.1158, "num_tokens": 4930772.0, "reward": 1.023941695690155, "reward_std": 0.24003357347100973, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.07224894315004349, "rewards/penalized_accuracy_reward/std": 0.15540502965450287, "rewards/reasoning_steps_reward/mean": 0.9166667014360428, 
"rewards/reasoning_steps_reward/std": 0.19612576067447662, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 99 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 823.75, "completions/max_terminated_length": 813.0, "completions/mean_length": 513.296875, "completions/mean_terminated_length": 507.20314025878906, "completions/min_length": 221.75, "completions/min_terminated_length": 221.75, "epoch": 0.03333333333333333, "grad_norm": 0.8136687874794006, "kl": 0.049072265625, "learning_rate": 6.600000000000001e-06, "loss": -0.0653, "num_tokens": 4972919.0, "reward": 0.8863281309604645, "reward_std": 0.17739969119429588, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.23328252136707306, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.8437500149011612, "rewards/reasoning_steps_reward/std": 0.22764474898576736, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15899410098791122, "step": 100 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.25, "completions/max_terminated_length": 705.25, "completions/mean_length": 473.109375, "completions/mean_terminated_length": 473.109375, "completions/min_length": 202.25, "completions/min_terminated_length": 202.25, "epoch": 0.033666666666666664, "grad_norm": 0.9104421138763428, "kl": 0.05877685546875, "learning_rate": 6.666666666666667e-06, "loss": -0.0692, "num_tokens": 5011790.0, "reward": 1.0768784284591675, "reward_std": 0.5057692248374224, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.1875, "rewards/penalized_accuracy_reward/mean": 0.19367528706789017, "rewards/penalized_accuracy_reward/std": 0.417163223028183, "rewards/reasoning_steps_reward/mean": 0.8125000149011612, "rewards/reasoning_steps_reward/std": 0.2908661887049675, 
"rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14216844737529755, "step": 101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 833.0, "completions/max_terminated_length": 772.25, "completions/mean_length": 542.375, "completions/mean_terminated_length": 528.0625, "completions/min_length": 267.75, "completions/min_terminated_length": 267.75, "epoch": 0.034, "grad_norm": 0.7064374089241028, "kl": 0.05255126953125, "learning_rate": 6.733333333333334e-06, "loss": 0.0097, "num_tokens": 5055558.0, "reward": 0.9852046072483063, "reward_std": 0.20568905398249626, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.21039126068353653, "rewards/penalized_accuracy_reward/mean": 0.032470230013132095, "rewards/penalized_accuracy_reward/std": 0.12988092005252838, "rewards/reasoning_steps_reward/mean": 0.96875, "rewards/reasoning_steps_reward/std": 0.10782546922564507, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18054034188389778, "step": 102 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 952.0, "completions/max_terminated_length": 939.5, "completions/mean_length": 630.1875, "completions/mean_terminated_length": 595.8125, "completions/min_length": 232.25, "completions/min_terminated_length": 232.25, "epoch": 0.034333333333333334, "grad_norm": 0.6197292804718018, "kl": 0.0474853515625, "learning_rate": 6.800000000000001e-06, "loss": -0.0475, "num_tokens": 5107490.0, "reward": 0.9695312529802322, "reward_std": 0.20866616070270538, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.18217839300632477, "rewards/penalized_accuracy_reward/mean": 0.02734375, "rewards/penalized_accuracy_reward/std": 0.1093750074505806, "rewards/reasoning_steps_reward/mean": 0.9687500149011612, "rewards/reasoning_steps_reward/std": 0.10782546550035477, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 
0.11353103816509247, "step": 103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 818.5, "completions/max_terminated_length": 762.5, "completions/mean_length": 550.09375, "completions/mean_terminated_length": 541.9562530517578, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.034666666666666665, "grad_norm": 0.5912279486656189, "kl": 0.05926513671875, "learning_rate": 6.866666666666667e-06, "loss": -0.0204, "num_tokens": 5151560.0, "reward": 1.1052062809467316, "reward_std": 0.4189284183084965, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.08539126068353653, "rewards/penalized_accuracy_reward/mean": 0.13932083547115326, "rewards/penalized_accuracy_reward/std": 0.38097113370895386, "rewards/reasoning_steps_reward/mean": 0.9583333432674408, "rewards/reasoning_steps_reward/std": 0.09622505307197571, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.021347815170884132, "step": 104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 923.75, "completions/max_terminated_length": 917.75, "completions/mean_length": 665.375, "completions/mean_terminated_length": 657.4486694335938, "completions/min_length": 394.75, "completions/min_terminated_length": 394.75, "epoch": 0.035, "grad_norm": 0.6213710308074951, "kl": 0.049072265625, "learning_rate": 6.9333333333333344e-06, "loss": 0.0131, "num_tokens": 5204944.0, "reward": 0.9998129159212112, "reward_std": 0.159408881329, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.10077822208404541, "rewards/penalized_accuracy_reward/mean": 0.024943124502897263, "rewards/penalized_accuracy_reward/std": 0.09977250546216965, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.025194555521011353, "step": 105 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 604.21875, "completions/mean_terminated_length": 604.21875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.035333333333333335, "grad_norm": 0.749445915222168, "kl": 0.05389404296875, "learning_rate": 7e-06, "loss": -0.1121, "num_tokens": 5254142.0, "reward": 1.0192549675703049, "reward_std": 0.2774948216974735, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.060921624302864075, "rewards/penalized_accuracy_reward/std": 0.2436865046620369, "rewards/reasoning_steps_reward/mean": 0.9479166865348816, "rewards/reasoning_steps_reward/std": 0.14323293417692184, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.125, "step": 106 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 948.75, "completions/max_terminated_length": 925.75, "completions/mean_length": 705.203125, "completions/mean_terminated_length": 693.4765014648438, "completions/min_length": 479.5, "completions/min_terminated_length": 479.5, "epoch": 0.035666666666666666, "grad_norm": 0.6594750285148621, "kl": 0.05364990234375, "learning_rate": 7.066666666666667e-06, "loss": 0.0348, "num_tokens": 5308507.0, "reward": 1.0982975512742996, "reward_std": 0.3949567638337612, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.2257782220840454, "rewards/penalized_accuracy_reward/mean": 0.14165689051151276, "rewards/penalized_accuracy_reward/std": 0.31518884748220444, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.0624999962747097, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14216844737529755, "step": 107 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 
938.75, "completions/max_terminated_length": 919.0, "completions/mean_length": 693.21875, "completions/mean_terminated_length": 689.9916839599609, "completions/min_length": 470.5, "completions/min_terminated_length": 470.5, "epoch": 0.036, "grad_norm": 0.6489946842193604, "kl": 0.0594482421875, "learning_rate": 7.133333333333334e-06, "loss": 0.0069, "num_tokens": 5360985.0, "reward": 0.936531126499176, "reward_std": 0.27230495028197765, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.34944770485162735, "rewards/penalized_accuracy_reward/mean": 0.03379674255847931, "rewards/penalized_accuracy_reward/std": 0.13518697023391724, "rewards/reasoning_steps_reward/mean": 0.9375000149011612, "rewards/reasoning_steps_reward/std": 0.152117520570755, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.26711349189281464, "step": 108 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 747.75, "completions/max_terminated_length": 729.0, "completions/mean_length": 517.265625, "completions/mean_terminated_length": 512.7416687011719, "completions/min_length": 343.75, "completions/min_terminated_length": 343.75, "epoch": 0.036333333333333336, "grad_norm": 0.7832958102226257, "kl": 0.0709228515625, "learning_rate": 7.2000000000000005e-06, "loss": 0.008, "num_tokens": 5402474.0, "reward": 1.364576980471611, "reward_std": 0.522902300581336, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.21039126068353653, "rewards/penalized_accuracy_reward/mean": 0.4221290349960327, "rewards/penalized_accuracy_reward/std": 0.47028493881225586, "rewards/reasoning_steps_reward/mean": 0.9427083432674408, "rewards/reasoning_steps_reward/std": 0.11986106634140015, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14152991026639938, "step": 109 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 924.0, "completions/max_terminated_length": 
888.25, "completions/mean_length": 714.40625, "completions/mean_terminated_length": 688.4537963867188, "completions/min_length": 413.75, "completions/min_terminated_length": 413.75, "epoch": 0.03666666666666667, "grad_norm": 0.7029095888137817, "kl": 0.05279541015625, "learning_rate": 7.266666666666668e-06, "loss": 0.0519, "num_tokens": 5461796.0, "reward": 0.9207031279802322, "reward_std": 0.1675838977098465, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.33539126068353653, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.04929708316922188, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.22450439631938934, "step": 110 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 986.0, "completions/max_terminated_length": 947.25, "completions/mean_length": 736.84375, "completions/mean_terminated_length": 703.7214202880859, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.037, "grad_norm": 0.6748602986335754, "kl": 0.05230712890625, "learning_rate": 7.333333333333333e-06, "loss": 0.0767, "num_tokens": 5517274.0, "reward": 0.9266927242279053, "reward_std": 0.15901001170277596, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3300696536898613, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.0833333320915699, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1924012266099453, "step": 111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 940.75, "completions/max_terminated_length": 896.75, "completions/mean_length": 634.515625, "completions/mean_terminated_length": 
627.9427185058594, "completions/min_length": 337.75, "completions/min_terminated_length": 337.75, "epoch": 0.037333333333333336, "grad_norm": 0.7816139459609985, "kl": 0.06292724609375, "learning_rate": 7.4e-06, "loss": -0.05, "num_tokens": 5566667.0, "reward": 1.1322840005159378, "reward_std": 0.37390279583632946, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.1632782220840454, "rewards/penalized_accuracy_reward/mean": 0.18606003746390343, "rewards/penalized_accuracy_reward/std": 0.2862424701452255, "rewards/reasoning_steps_reward/mean": 0.9479166716337204, "rewards/reasoning_steps_reward/std": 0.14323293790221214, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.07966844737529755, "step": 112 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 801.75, "completions/max_terminated_length": 768.75, "completions/mean_length": 557.125, "completions/mean_terminated_length": 550.9229278564453, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.03766666666666667, "grad_norm": 0.710523247718811, "kl": 0.08251953125, "learning_rate": 7.4666666666666675e-06, "loss": 0.0007, "num_tokens": 5611475.0, "reward": 1.057383418083191, "reward_std": 0.24412458762526512, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.08539126068353653, "rewards/penalized_accuracy_reward/mean": 0.08485737442970276, "rewards/penalized_accuracy_reward/std": 0.1838604211807251, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.09096375107765198, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06340491026639938, "step": 113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1000.5, "completions/max_terminated_length": 932.5, "completions/mean_length": 744.984375, "completions/mean_terminated_length": 718.1194305419922, "completions/min_length": 
399.75, "completions/min_terminated_length": 399.75, "epoch": 0.038, "grad_norm": 0.7475992441177368, "kl": 0.06201171875, "learning_rate": 7.533333333333334e-06, "loss": 0.0666, "num_tokens": 5670786.0, "reward": 0.993861049413681, "reward_std": 0.2636701911687851, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.2750816270709038, "rewards/penalized_accuracy_reward/mean": 0.05232458561658859, "rewards/penalized_accuracy_reward/std": 0.14314451813697815, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.13477232307195663, "step": 114 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 977.0, "completions/max_terminated_length": 938.75, "completions/mean_length": 711.25, "completions/mean_terminated_length": 691.9525451660156, "completions/min_length": 438.5, "completions/min_terminated_length": 438.5, "epoch": 0.03833333333333333, "grad_norm": 0.7733854651451111, "kl": 0.0596923828125, "learning_rate": 7.600000000000001e-06, "loss": 0.0926, "num_tokens": 5730226.0, "reward": 0.960807278752327, "reward_std": 0.12203849479556084, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27289126068353653, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.08659191615879536, "step": 115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 929.0, "completions/max_terminated_length": 911.25, "completions/mean_length": 694.640625, "completions/mean_terminated_length": 666.6000061035156, "completions/min_length": 331.5, "completions/min_terminated_length": 331.5, "epoch": 0.03866666666666667, 
"grad_norm": 0.5846694111824036, "kl": 0.06732177734375, "learning_rate": 7.666666666666667e-06, "loss": -0.0038, "num_tokens": 5783451.0, "reward": 0.9777296334505081, "reward_std": 0.20587004628032446, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.1280868947505951, "rewards/penalized_accuracy_reward/mean": 0.03437023237347603, "rewards/penalized_accuracy_reward/std": 0.1374809294939041, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.04929708316922188, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.0654262900352478, "step": 116 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 950.25, "completions/max_terminated_length": 907.75, "completions/mean_length": 673.171875, "completions/mean_terminated_length": 658.1291809082031, "completions/min_length": 412.25, "completions/min_terminated_length": 412.25, "epoch": 0.039, "grad_norm": 0.8333153128623962, "kl": 0.071044921875, "learning_rate": 7.733333333333334e-06, "loss": -0.0128, "num_tokens": 5837190.0, "reward": 1.3094747960567474, "reward_std": 0.6067601572722197, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.1875, "rewards/penalized_accuracy_reward/mean": 0.3357768412679434, "rewards/penalized_accuracy_reward/std": 0.5968192145228386, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.08086910098791122, "step": 117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.25, "completions/max_terminated_length": 906.25, "completions/mean_length": 637.984375, "completions/mean_terminated_length": 637.984375, "completions/min_length": 362.75, "completions/min_terminated_length": 362.75, "epoch": 0.03933333333333333, "grad_norm": 0.3177330195903778, "kl": 0.0679931640625, 
"learning_rate": 7.800000000000002e-06, "loss": -0.0297, "num_tokens": 5889061.0, "reward": 0.9921875, "reward_std": 0.022662729024887085, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.984375, "rewards/reasoning_steps_reward/std": 0.045325469225645065, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 960.75, "completions/max_terminated_length": 933.5, "completions/mean_length": 682.9375, "completions/mean_terminated_length": 671.2948150634766, "completions/min_length": 391.5, "completions/min_terminated_length": 391.5, "epoch": 0.03966666666666667, "grad_norm": 0.8171834945678711, "kl": 0.06939697265625, "learning_rate": 7.866666666666667e-06, "loss": 0.0615, "num_tokens": 5943329.0, "reward": 1.051426261663437, "reward_std": 0.2572258897125721, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.21039126068353653, "rewards/penalized_accuracy_reward/mean": 0.07798877358436584, "rewards/penalized_accuracy_reward/std": 0.1678096055984497, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.05259781517088413, "step": 119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.5, "completions/max_terminated_length": 819.5, "completions/mean_length": 581.84375, "completions/mean_terminated_length": 581.84375, "completions/min_length": 377.5, "completions/min_terminated_length": 377.5, "epoch": 0.04, "grad_norm": 0.6569817662239075, "kl": 0.07470703125, "learning_rate": 7.933333333333334e-06, "loss": -0.0118, "num_tokens": 5989639.0, "reward": 1.2491122335195541, "reward_std": 0.22335275262594223, "rewards/format_reward/mean": 
0.953125, "rewards/format_reward/std": 0.14789126068353653, "rewards/penalized_accuracy_reward/mean": 0.280622661113739, "rewards/penalized_accuracy_reward/std": 0.17314550280570984, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.0833333320915699, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.06798820197582245, "step": 120 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 892.25, "completions/max_terminated_length": 800.5, "completions/mean_length": 574.3125, "completions/mean_terminated_length": 553.1169738769531, "completions/min_length": 312.25, "completions/min_terminated_length": 312.25, "epoch": 0.04033333333333333, "grad_norm": 0.6567302346229553, "kl": 0.078125, "learning_rate": 8.000000000000001e-06, "loss": 0.1119, "num_tokens": 6033883.0, "reward": 0.9722656160593033, "reward_std": 0.07208464667201042, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.1632782220840454, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.07966844737529755, "step": 121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.25, "completions/max_terminated_length": 739.25, "completions/mean_length": 541.765625, "completions/mean_terminated_length": 541.765625, "completions/min_length": 366.25, "completions/min_terminated_length": 366.25, "epoch": 0.04066666666666666, "grad_norm": 0.8157733678817749, "kl": 0.080810546875, "learning_rate": 8.066666666666667e-06, "loss": -0.0225, "num_tokens": 6079660.0, "reward": 1.0707407891750336, "reward_std": 0.38787663727998734, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.1632782220840454, "rewards/penalized_accuracy_reward/mean": 
0.10589701682329178, "rewards/penalized_accuracy_reward/std": 0.33678513765335083, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.0624999962747097, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.06520001962780952, "step": 122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 963.5, "completions/max_terminated_length": 961.25, "completions/mean_length": 680.15625, "completions/mean_terminated_length": 674.4635467529297, "completions/min_length": 405.75, "completions/min_terminated_length": 405.75, "epoch": 0.041, "grad_norm": 0.7646713256835938, "kl": 0.06640625, "learning_rate": 8.133333333333334e-06, "loss": 0.0321, "num_tokens": 6133430.0, "reward": 0.9917968809604645, "reward_std": 0.19527434464544058, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.1875, "rewards/penalized_accuracy_reward/mean": 0.02734375, "rewards/penalized_accuracy_reward/std": 0.1093750074505806, "rewards/reasoning_steps_reward/mean": 0.9687500149011612, "rewards/reasoning_steps_reward/std": 0.11179708316922188, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 123 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.25, "completions/max_terminated_length": 862.25, "completions/mean_length": 613.53125, "completions/mean_terminated_length": 613.53125, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.04133333333333333, "grad_norm": 0.5323629379272461, "kl": 0.0792236328125, "learning_rate": 8.2e-06, "loss": 0.0065, "num_tokens": 6182744.0, "reward": 1.3273025453090668, "reward_std": 0.2767297988757491, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.33771923184394836, "rewards/penalized_accuracy_reward/std": 0.2713307738304138, "rewards/reasoning_steps_reward/mean": 
0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.0833333320915699, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 124 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 957.0, "completions/max_terminated_length": 904.25, "completions/mean_length": 646.84375, "completions/mean_terminated_length": 636.4083557128906, "completions/min_length": 406.5, "completions/min_terminated_length": 406.5, "epoch": 0.041666666666666664, "grad_norm": 0.6856974363327026, "kl": 0.06842041015625, "learning_rate": 8.266666666666667e-06, "loss": 0.0573, "num_tokens": 6234014.0, "reward": 1.0164173543453217, "reward_std": 0.20691745728254318, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.037511106580495834, "rewards/penalized_accuracy_reward/std": 0.15004444122314453, "rewards/reasoning_steps_reward/mean": 0.984375, "rewards/reasoning_steps_reward/std": 0.045325469225645065, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 125 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.5, "completions/max_terminated_length": 883.5, "completions/mean_length": 606.40625, "completions/mean_terminated_length": 606.40625, "completions/min_length": 335.75, "completions/min_terminated_length": 335.75, "epoch": 0.042, "grad_norm": 0.5305747985839844, "kl": 0.0701904296875, "learning_rate": 8.333333333333334e-06, "loss": -0.0483, "num_tokens": 6281528.0, "reward": 0.9869791865348816, "reward_std": 0.04548186343163252, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.973958358168602, "rewards/reasoning_steps_reward/std": 0.09096374735236168, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 
126 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.25, "completions/max_terminated_length": 846.25, "completions/mean_length": 568.875, "completions/mean_terminated_length": 568.875, "completions/min_length": 368.75, "completions/min_terminated_length": 368.75, "epoch": 0.042333333333333334, "grad_norm": 0.6627109050750732, "kl": 0.0797119140625, "learning_rate": 8.400000000000001e-06, "loss": 0.0405, "num_tokens": 6326704.0, "reward": 0.9777343720197678, "reward_std": 0.0601552352309227, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.96875, "rewards/reasoning_steps_reward/std": 0.06718547642230988, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 127 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.25, "completions/max_terminated_length": 594.25, "completions/mean_length": 446.21875, "completions/mean_terminated_length": 446.21875, "completions/min_length": 289.5, "completions/min_terminated_length": 289.5, "epoch": 0.042666666666666665, "grad_norm": 0.861020028591156, "kl": 0.0860595703125, "learning_rate": 8.466666666666668e-06, "loss": -0.0615, "num_tokens": 6365326.0, "reward": 1.0522719621658325, "reward_std": 0.27669200487434864, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09133441746234894, "rewards/penalized_accuracy_reward/std": 0.1965949833393097, "rewards/reasoning_steps_reward/mean": 0.9218750149011612, "rewards/reasoning_steps_reward/std": 0.19525551423430443, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, 
"completions/mean_length": 472.890625, "completions/mean_terminated_length": 472.890625, "completions/min_length": 295.25, "completions/min_terminated_length": 295.25, "epoch": 0.043, "grad_norm": 0.8932364583015442, "kl": 0.0789794921875, "learning_rate": 8.533333333333335e-06, "loss": -0.0626, "num_tokens": 6404775.0, "reward": 0.9579332917928696, "reward_std": 0.20159808173775673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.030849946662783623, "rewards/penalized_accuracy_reward/std": 0.12339979410171509, "rewards/reasoning_steps_reward/mean": 0.8541666865348816, "rewards/reasoning_steps_reward/std": 0.1918465420603752, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 564.015625, "completions/mean_terminated_length": 564.015625, "completions/min_length": 355.75, "completions/min_terminated_length": 355.75, "epoch": 0.043333333333333335, "grad_norm": 0.5215252637863159, "kl": 0.0770263671875, "learning_rate": 8.6e-06, "loss": 0.011, "num_tokens": 6450040.0, "reward": 1.2660482078790665, "reward_std": 0.4907595328986645, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2738606929779053, "rewards/penalized_accuracy_reward/std": 0.47496628761291504, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.04929708316922188, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 130 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.75, "completions/max_terminated_length": 862.75, "completions/mean_length": 583.265625, "completions/mean_terminated_length": 583.265625, "completions/min_length": 387.75, 
"completions/min_terminated_length": 387.75, "epoch": 0.043666666666666666, "grad_norm": 0.5661642551422119, "kl": 0.06927490234375, "learning_rate": 8.666666666666668e-06, "loss": -0.044, "num_tokens": 6495593.0, "reward": 0.9751302152872086, "reward_std": 0.06074373424053192, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9635416865348816, "rewards/reasoning_steps_reward/std": 0.08656488358974457, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 131 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 865.0, "completions/max_terminated_length": 796.25, "completions/mean_length": 612.59375, "completions/mean_terminated_length": 601.3906402587891, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.044, "grad_norm": 0.7252206802368164, "kl": 0.07562255859375, "learning_rate": 8.733333333333333e-06, "loss": 0.0371, "num_tokens": 6544847.0, "reward": 0.9873867779970169, "reward_std": 0.23923740535974503, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.14789126068353653, "rewards/penalized_accuracy_reward/mean": 0.035303402692079544, "rewards/penalized_accuracy_reward/std": 0.14121361076831818, "rewards/reasoning_steps_reward/mean": 0.9479167014360428, "rewards/reasoning_steps_reward/std": 0.14964327588677406, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09120866656303406, "step": 132 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.75, "completions/max_terminated_length": 813.75, "completions/mean_length": 564.140625, "completions/mean_terminated_length": 564.140625, "completions/min_length": 386.5, "completions/min_terminated_length": 386.5, "epoch": 0.044333333333333336, "grad_norm": 
0.6854252219200134, "kl": 0.07403564453125, "learning_rate": 8.8e-06, "loss": -0.0208, "num_tokens": 6590808.0, "reward": 0.9540364742279053, "reward_std": 0.0992764113470912, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.14789126068353653, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9479166865348816, "rewards/reasoning_steps_reward/std": 0.12311986833810806, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.03697281517088413, "step": 133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 880.0, "completions/max_terminated_length": 772.25, "completions/mean_length": 629.671875, "completions/mean_terminated_length": 604.9509124755859, "completions/min_length": 390.25, "completions/min_terminated_length": 390.25, "epoch": 0.04466666666666667, "grad_norm": 0.4922334849834442, "kl": 0.0748291015625, "learning_rate": 8.866666666666668e-06, "loss": 0.0967, "num_tokens": 6642387.0, "reward": 0.9710937589406967, "reward_std": 0.07905462384223938, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.17078252136707306, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11245574057102203, "step": 134 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.5, "completions/max_terminated_length": 744.5, "completions/mean_length": 534.625, "completions/mean_terminated_length": 534.625, "completions/min_length": 347.25, "completions/min_terminated_length": 347.25, "epoch": 0.045, "grad_norm": 0.5973943471908569, "kl": 0.06695556640625, "learning_rate": 8.933333333333333e-06, "loss": -0.0403, "num_tokens": 6688251.0, "reward": 1.1759473234415054, 
"reward_std": 0.27473309077322483, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.1907910406589508, "rewards/penalized_accuracy_reward/std": 0.22456614673137665, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.0624999962747097, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.25, "completions/max_terminated_length": 871.25, "completions/mean_length": 545.59375, "completions/mean_terminated_length": 545.59375, "completions/min_length": 346.75, "completions/min_terminated_length": 346.75, "epoch": 0.04533333333333334, "grad_norm": 0.7356082201004028, "kl": 0.06689453125, "learning_rate": 9e-06, "loss": 0.0034, "num_tokens": 6731681.0, "reward": 0.9843750298023224, "reward_std": 0.06249998975545168, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9687500149011612, "rewards/reasoning_steps_reward/std": 0.1249999962747097, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.25, "completions/max_terminated_length": 789.25, "completions/mean_length": 548.453125, "completions/mean_terminated_length": 548.453125, "completions/min_length": 353.5, "completions/min_terminated_length": 353.5, "epoch": 0.04566666666666667, "grad_norm": 0.5256035327911377, "kl": 0.0791015625, "learning_rate": 9.066666666666667e-06, "loss": -0.0046, "num_tokens": 6777742.0, "reward": 1.1897451877593994, "reward_std": 0.28156070224940777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.20016179978847504, 
"rewards/penalized_accuracy_reward/std": 0.26698175072669983, "rewards/reasoning_steps_reward/mean": 0.9791666865348816, "rewards/reasoning_steps_reward/std": 0.0833333283662796, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 137 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 826.75, "completions/max_terminated_length": 797.5, "completions/mean_length": 540.46875, "completions/mean_terminated_length": 534.0687561035156, "completions/min_length": 384.75, "completions/min_terminated_length": 384.75, "epoch": 0.046, "grad_norm": 0.6895825266838074, "kl": 0.072998046875, "learning_rate": 9.133333333333335e-06, "loss": 0.0353, "num_tokens": 6824652.0, "reward": 1.023446962237358, "reward_std": 0.2008255310356617, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.03790007159113884, "rewards/penalized_accuracy_reward/std": 0.15160028636455536, "rewards/reasoning_steps_reward/mean": 0.984375, "rewards/reasoning_steps_reward/std": 0.045325469225645065, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.25, "completions/max_terminated_length": 903.25, "completions/mean_length": 568.640625, "completions/mean_terminated_length": 568.640625, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.04633333333333333, "grad_norm": 0.7145423293113708, "kl": 0.0782470703125, "learning_rate": 9.200000000000002e-06, "loss": -0.0324, "num_tokens": 6872709.0, "reward": 1.254620909690857, "reward_std": 0.4884070521220565, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.26503754034638405, "rewards/penalized_accuracy_reward/std": 0.455327644944191, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, 
"rewards/reasoning_steps_reward/std": 0.06615880131721497, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 139 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 831.75, "completions/max_terminated_length": 798.25, "completions/mean_length": 577.6875, "completions/mean_terminated_length": 572.1739654541016, "completions/min_length": 370.25, "completions/min_terminated_length": 370.25, "epoch": 0.04666666666666667, "grad_norm": 0.4422559142112732, "kl": 0.0770263671875, "learning_rate": 9.266666666666667e-06, "loss": 0.0223, "num_tokens": 6917809.0, "reward": 1.1303823590278625, "reward_std": 0.2677050596103072, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.14040839672088623, "rewards/penalized_accuracy_reward/std": 0.2511829137802124, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 140 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 825.5, "completions/max_terminated_length": 783.0, "completions/mean_length": 578.609375, "completions/mean_terminated_length": 572.3770904541016, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.047, "grad_norm": 0.41671282052993774, "kl": 0.0897216796875, "learning_rate": 9.333333333333334e-06, "loss": -0.0063, "num_tokens": 6962376.0, "reward": 1.0969283878803253, "reward_std": 0.33608537912368774, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.10356900468468666, "rewards/penalized_accuracy_reward/std": 0.3315762132406235, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, 
"rewards/tag_count_reward/std": 0.015625, "step": 141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.75, "completions/max_terminated_length": 709.75, "completions/mean_length": 574.21875, "completions/mean_terminated_length": 574.21875, "completions/min_length": 432.25, "completions/min_terminated_length": 432.25, "epoch": 0.04733333333333333, "grad_norm": 0.5335647463798523, "kl": 0.08154296875, "learning_rate": 9.4e-06, "loss": 0.0193, "num_tokens": 7009670.0, "reward": 1.3210109174251556, "reward_std": 0.4340841621160507, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.32882341742515564, "rewards/penalized_accuracy_reward/std": 0.43917667865753174, "rewards/reasoning_steps_reward/mean": 0.984375, "rewards/reasoning_steps_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 142 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 937.5, "completions/max_terminated_length": 927.75, "completions/mean_length": 658.375, "completions/mean_terminated_length": 642.2320098876953, "completions/min_length": 405.25, "completions/min_terminated_length": 405.25, "epoch": 0.04766666666666667, "grad_norm": 0.7152761220932007, "kl": 0.08819580078125, "learning_rate": 9.466666666666667e-06, "loss": 0.0784, "num_tokens": 7063806.0, "reward": 1.112352579832077, "reward_std": 0.4564479161053896, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.21039126068353653, "rewards/penalized_accuracy_reward/mean": 0.14047756046056747, "rewards/penalized_accuracy_reward/std": 0.3838609904050827, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11091229319572449, "step": 143 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.5, 
"completions/max_terminated_length": 804.5, "completions/mean_length": 592.171875, "completions/mean_terminated_length": 592.171875, "completions/min_length": 403.75, "completions/min_terminated_length": 403.75, "epoch": 0.048, "grad_norm": 0.42576244473457336, "kl": 0.0836181640625, "learning_rate": 9.533333333333334e-06, "loss": -0.007, "num_tokens": 7112569.0, "reward": 1.0555336475372314, "reward_std": 0.17897546291351318, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.06334614008665085, "rewards/penalized_accuracy_reward/std": 0.17320473492145538, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 144 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 916.5, "completions/max_terminated_length": 843.25, "completions/mean_length": 615.765625, "completions/mean_terminated_length": 609.4593811035156, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.04833333333333333, "grad_norm": 0.7763967514038086, "kl": 0.085205078125, "learning_rate": 9.600000000000001e-06, "loss": 0.0302, "num_tokens": 7162346.0, "reward": 1.097764641046524, "reward_std": 0.3960863724350929, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.1875, "rewards/penalized_accuracy_reward/mean": 0.12289484962821007, "rewards/penalized_accuracy_reward/std": 0.3364574760198593, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 866.25, "completions/max_terminated_length": 848.0, "completions/mean_length": 651.59375, "completions/mean_terminated_length": 
629.2216033935547, "completions/min_length": 447.75, "completions/min_terminated_length": 447.75, "epoch": 0.048666666666666664, "grad_norm": 0.5924893617630005, "kl": 0.0850830078125, "learning_rate": 9.666666666666667e-06, "loss": 0.063, "num_tokens": 7213984.0, "reward": 1.452492356300354, "reward_std": 0.6358413472771645, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.1875, "rewards/penalized_accuracy_reward/mean": 0.500148594379425, "rewards/penalized_accuracy_reward/std": 0.5861188173294067, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.06404344737529755, "step": 146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 646.75, "completions/max_terminated_length": 626.5, "completions/mean_length": 452.921875, "completions/mean_terminated_length": 441.7901916503906, "completions/min_length": 266.25, "completions/min_terminated_length": 266.25, "epoch": 0.049, "grad_norm": 0.8521579504013062, "kl": 0.09228515625, "learning_rate": 9.733333333333334e-06, "loss": -0.0441, "num_tokens": 7251755.0, "reward": 1.3368138670921326, "reward_std": 0.15262084361165762, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.2596946656703949, "rewards/penalized_accuracy_reward/mean": 0.4003555178642273, "rewards/penalized_accuracy_reward/std": 0.04466142877936363, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.0833333320915699, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.21579129993915558, "step": 147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 484.296875, "completions/mean_terminated_length": 484.296875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, 
"epoch": 0.04933333333333333, "grad_norm": 0.8887056708335876, "kl": 0.096435546875, "learning_rate": 9.800000000000001e-06, "loss": -0.0271, "num_tokens": 7293534.0, "reward": 1.0694793164730072, "reward_std": 0.40229372307658195, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.4022643193602562, "rewards/penalized_accuracy_reward/mean": 0.16635430604219437, "rewards/penalized_accuracy_reward/std": 0.3324264883995056, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.3176925200968981, "step": 148 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 859.5, "completions/max_terminated_length": 857.0, "completions/mean_length": 633.71875, "completions/mean_terminated_length": 630.0052185058594, "completions/min_length": 394.5, "completions/min_terminated_length": 394.5, "epoch": 0.049666666666666665, "grad_norm": 0.6912094354629517, "kl": 0.0787353515625, "learning_rate": 9.866666666666668e-06, "loss": 0.0167, "num_tokens": 7343500.0, "reward": 1.2271893173456192, "reward_std": 0.37537867948412895, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.24463719688355923, "rewards/penalized_accuracy_reward/std": 0.33784525841474533, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.06454972177743912, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 149 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 789.25, "completions/max_terminated_length": 743.25, "completions/mean_length": 593.515625, "completions/mean_terminated_length": 568.71875, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.05, "grad_norm": 0.7075642347335815, "kl": 0.091064453125, "learning_rate": 
9.933333333333334e-06, "loss": 0.0403, "num_tokens": 7394333.0, "reward": 0.9887036979198456, "reward_std": 0.21000467520207167, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.17430340498685837, "rewards/penalized_accuracy_reward/mean": 0.028807848691940308, "rewards/penalized_accuracy_reward/std": 0.11523139476776123, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.14635255187749863, "step": 150 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 751.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 535.09375, "completions/mean_terminated_length": 528.0125122070312, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.050333333333333334, "grad_norm": 0.4760342538356781, "kl": 0.0927734375, "learning_rate": 1e-05, "loss": -0.0331, "num_tokens": 7438003.0, "reward": 0.9880945086479187, "reward_std": 0.20418935269117355, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.1971946656703949, "rewards/penalized_accuracy_reward/mean": 0.03457889333367348, "rewards/penalized_accuracy_reward/std": 0.1383155733346939, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.18738707154989243, "step": 151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.5, "completions/max_terminated_length": 794.5, "completions/mean_length": 529.515625, "completions/mean_terminated_length": 529.515625, "completions/min_length": 351.5, "completions/min_terminated_length": 351.5, "epoch": 0.050666666666666665, "grad_norm": 0.6448789834976196, "kl": 0.101806640625, "learning_rate": 1.0066666666666666e-05, "loss": -0.0228, "num_tokens": 7481876.0, "reward": 
0.9598958492279053, "reward_std": 0.10131500661373138, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.1875, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.9635416716337204, "rewards/reasoning_steps_reward/std": 0.11148427054286003, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.125, "step": 152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 542.40625, "completions/mean_terminated_length": 542.40625, "completions/min_length": 291.25, "completions/min_terminated_length": 291.25, "epoch": 0.051, "grad_norm": 0.6065596342086792, "kl": 0.10888671875, "learning_rate": 1.0133333333333335e-05, "loss": 0.0136, "num_tokens": 7529262.0, "reward": 1.0723292827606201, "reward_std": 0.26749104261398315, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.10357928276062012, "rewards/penalized_accuracy_reward/std": 0.2227180302143097, "rewards/reasoning_steps_reward/mean": 0.96875, "rewards/reasoning_steps_reward/std": 0.07375510036945343, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.125, "step": 153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.5, "completions/max_terminated_length": 819.5, "completions/mean_length": 581.640625, "completions/mean_terminated_length": 581.640625, "completions/min_length": 388.25, "completions/min_terminated_length": 388.25, "epoch": 0.051333333333333335, "grad_norm": 0.7174891829490662, "kl": 0.0872802734375, "learning_rate": 1.02e-05, "loss": 0.0025, "num_tokens": 7576631.0, "reward": 1.0558475106954575, "reward_std": 0.27014252822846174, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 
0.07329539954662323, "rewards/penalized_accuracy_reward/std": 0.20035086572170258, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09375, "step": 154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.25, "completions/max_terminated_length": 809.25, "completions/mean_length": 554.03125, "completions/mean_terminated_length": 554.03125, "completions/min_length": 333.75, "completions/min_terminated_length": 333.75, "epoch": 0.051666666666666666, "grad_norm": 0.5973610281944275, "kl": 0.0762939453125, "learning_rate": 1.0266666666666668e-05, "loss": 0.0161, "num_tokens": 7621753.0, "reward": 0.962109386920929, "reward_std": 0.04935498908162117, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9375000149011612, "rewards/reasoning_steps_reward/std": 0.08836335688829422, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 557.0625, "completions/mean_terminated_length": 557.0625, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.052, "grad_norm": 0.5781278014183044, "kl": 0.09228515625, "learning_rate": 1.0333333333333335e-05, "loss": -0.0361, "num_tokens": 7667229.0, "reward": 0.9921875298023224, "reward_std": 0.03124998975545168, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 
0.0624999962747097, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 535.0, "completions/mean_terminated_length": 535.0, "completions/min_length": 312.5, "completions/min_terminated_length": 312.5, "epoch": 0.052333333333333336, "grad_norm": 0.08791633695363998, "kl": 0.1031494140625, "learning_rate": 1.04e-05, "loss": 0.0041, "num_tokens": 7711901.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.75, "completions/max_terminated_length": 888.75, "completions/mean_length": 569.859375, "completions/mean_terminated_length": 569.859375, "completions/min_length": 321.5, "completions/min_terminated_length": 321.5, "epoch": 0.05266666666666667, "grad_norm": 0.2468576729297638, "kl": 0.1019287109375, "learning_rate": 1.0466666666666668e-05, "loss": -0.0474, "num_tokens": 7759348.0, "reward": 0.9973958432674408, "reward_std": 0.010416663251817226, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 850.75, "completions/max_terminated_length": 827.0, "completions/mean_length": 
564.75, "completions/mean_terminated_length": 560.0031280517578, "completions/min_length": 386.25, "completions/min_terminated_length": 386.25, "epoch": 0.053, "grad_norm": 0.7196468710899353, "kl": 0.0997314453125, "learning_rate": 1.0533333333333333e-05, "loss": 0.0337, "num_tokens": 7804964.0, "reward": 1.1889299154281616, "reward_std": 0.41827017441391945, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.21640385873615742, "rewards/penalized_accuracy_reward/std": 0.3672218695282936, "rewards/reasoning_steps_reward/mean": 0.9583333432674408, "rewards/reasoning_steps_reward/std": 0.10400499776005745, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 159 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 912.25, "completions/max_terminated_length": 889.25, "completions/mean_length": 659.796875, "completions/mean_terminated_length": 649.8364715576172, "completions/min_length": 370.75, "completions/min_terminated_length": 370.75, "epoch": 0.05333333333333334, "grad_norm": 0.7643465995788574, "kl": 0.0953369140625, "learning_rate": 1.0600000000000002e-05, "loss": 0.0674, "num_tokens": 7857383.0, "reward": 1.0303755104541779, "reward_std": 0.2476533642038703, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.1875, "rewards/penalized_accuracy_reward/mean": 0.06331821531057358, "rewards/penalized_accuracy_reward/std": 0.17318597435951233, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.08538305386900902, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 160 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 922.5, "completions/max_terminated_length": 915.25, "completions/mean_length": 688.265625, "completions/mean_terminated_length": 680.2254486083984, 
"completions/min_length": 373.75, "completions/min_terminated_length": 373.75, "epoch": 0.05366666666666667, "grad_norm": 0.6083821654319763, "kl": 0.08447265625, "learning_rate": 1.0666666666666667e-05, "loss": 0.0299, "num_tokens": 7913800.0, "reward": 1.0095978379249573, "reward_std": 0.12780765816569328, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.08539126068353653, "rewards/penalized_accuracy_reward/mean": 0.022879095748066902, "rewards/penalized_accuracy_reward/std": 0.09151638299226761, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.021347815170884132, "step": 161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 934.0, "completions/max_terminated_length": 927.75, "completions/mean_length": 665.5625, "completions/mean_terminated_length": 660.6781311035156, "completions/min_length": 413.5, "completions/min_terminated_length": 413.5, "epoch": 0.054, "grad_norm": 0.498530775308609, "kl": 0.084228515625, "learning_rate": 1.0733333333333333e-05, "loss": 0.0739, "num_tokens": 7967820.0, "reward": 1.0622639656066895, "reward_std": 0.3131758403033018, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0734618678689003, "rewards/penalized_accuracy_reward/std": 0.2938474714756012, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.06404344737529755, "step": 162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 778.5, "completions/max_terminated_length": 777.75, "completions/mean_length": 593.96875, "completions/mean_terminated_length": 578.9427185058594, "completions/min_length": 289.5, "completions/min_terminated_length": 289.5, "epoch": 
0.05433333333333333, "grad_norm": 0.9088695645332336, "kl": 0.102783203125, "learning_rate": 1.0800000000000002e-05, "loss": -0.0051, "num_tokens": 8018394.0, "reward": 1.031543791294098, "reward_std": 0.33813750743865967, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.11180340498685837, "rewards/penalized_accuracy_reward/mean": 0.09391359053552151, "rewards/penalized_accuracy_reward/std": 0.27939801663160324, "rewards/reasoning_steps_reward/mean": 0.9322916865348816, "rewards/reasoning_steps_reward/std": 0.12620654702186584, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.08017472177743912, "step": 163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 629.40625, "completions/mean_terminated_length": 629.40625, "completions/min_length": 357.25, "completions/min_terminated_length": 357.25, "epoch": 0.05466666666666667, "grad_norm": 0.683199942111969, "kl": 0.095947265625, "learning_rate": 1.0866666666666667e-05, "loss": -0.0333, "num_tokens": 8068452.0, "reward": 0.9789062589406967, "reward_std": 0.08437499310821295, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.0624999962747097, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.75, "completions/max_terminated_length": 783.75, "completions/mean_length": 538.671875, "completions/mean_terminated_length": 538.671875, "completions/min_length": 324.75, "completions/min_terminated_length": 324.75, "epoch": 0.055, "grad_norm": 0.7674322128295898, "kl": 0.096435546875, "learning_rate": 1.0933333333333334e-05, 
"loss": 0.0147, "num_tokens": 8111759.0, "reward": 1.2586846053600311, "reward_std": 0.4014707673341036, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.08539126068353653, "rewards/penalized_accuracy_reward/mean": 0.31063770316541195, "rewards/penalized_accuracy_reward/std": 0.3363153263926506, "rewards/reasoning_steps_reward/mean": 0.9218750149011612, "rewards/reasoning_steps_reward/std": 0.14865445718169212, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 911.25, "completions/max_terminated_length": 872.5, "completions/mean_length": 625.640625, "completions/mean_terminated_length": 619.5031280517578, "completions/min_length": 316.75, "completions/min_terminated_length": 316.75, "epoch": 0.05533333333333333, "grad_norm": 0.6314289569854736, "kl": 0.1151123046875, "learning_rate": 1.1000000000000001e-05, "loss": -0.0063, "num_tokens": 8161384.0, "reward": 0.9713541865348816, "reward_std": 0.06449455861002207, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.08539126068353653, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.08699213340878487, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11027991026639938, "step": 166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.25, "completions/max_terminated_length": 760.25, "completions/mean_length": 594.03125, "completions/mean_terminated_length": 594.03125, "completions/min_length": 415.75, "completions/min_terminated_length": 415.75, "epoch": 0.05566666666666667, "grad_norm": 0.7111170291900635, "kl": 0.1248779296875, "learning_rate": 1.1066666666666669e-05, "loss": -0.0209, "num_tokens": 8207802.0, "reward": 1.0302852392196655, 
"reward_std": 0.16280756704509258, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.035493556410074234, "rewards/penalized_accuracy_reward/std": 0.14197422564029694, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 898.5, "completions/max_terminated_length": 819.5, "completions/mean_length": 615.109375, "completions/mean_terminated_length": 602.5218963623047, "completions/min_length": 367.75, "completions/min_terminated_length": 367.75, "epoch": 0.056, "grad_norm": 0.48290061950683594, "kl": 0.0958251953125, "learning_rate": 1.1133333333333334e-05, "loss": 0.0452, "num_tokens": 8259409.0, "reward": 1.0043448507785797, "reward_std": 0.1297731138765812, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.023615676909685135, "rewards/penalized_accuracy_reward/std": 0.09446270018815994, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.028463751077651978, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 981.25, "completions/max_terminated_length": 952.0, "completions/mean_length": 640.203125, "completions/mean_terminated_length": 626.9486694335938, "completions/min_length": 365.5, "completions/min_terminated_length": 365.5, "epoch": 0.05633333333333333, "grad_norm": 0.6626110076904297, "kl": 0.091552734375, "learning_rate": 1.1200000000000001e-05, "loss": 0.0242, "num_tokens": 8312334.0, "reward": 0.977734386920929, "reward_std": 0.06429007556289434, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 
0.08539126068353653, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.04929708316922188, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06340491026639938, "step": 169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 884.0, "completions/max_terminated_length": 873.75, "completions/mean_length": 599.890625, "completions/mean_terminated_length": 593.7854309082031, "completions/min_length": 362.75, "completions/min_terminated_length": 362.75, "epoch": 0.056666666666666664, "grad_norm": 0.8113217353820801, "kl": 0.1051025390625, "learning_rate": 1.1266666666666668e-05, "loss": 0.0321, "num_tokens": 8358455.0, "reward": 1.091463789343834, "reward_std": 0.2897885050624609, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.13078668527305126, "rewards/penalized_accuracy_reward/std": 0.23328303545713425, "rewards/reasoning_steps_reward/mean": 0.9479166865348816, "rewards/reasoning_steps_reward/std": 0.11515219509601593, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 170 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 816.0, "completions/max_terminated_length": 781.25, "completions/mean_length": 544.46875, "completions/mean_terminated_length": 538.4937591552734, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.057, "grad_norm": 0.773130476474762, "kl": 0.1029052734375, "learning_rate": 1.1333333333333334e-05, "loss": 0.0359, "num_tokens": 8403141.0, "reward": 1.2276104539632797, "reward_std": 0.38097991049289703, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.2628968879580498, 
"rewards/penalized_accuracy_reward/std": 0.33269084990024567, "rewards/reasoning_steps_reward/mean": 0.942708358168602, "rewards/reasoning_steps_reward/std": 0.15272368490695953, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.25, "completions/max_terminated_length": 769.25, "completions/mean_length": 547.65625, "completions/mean_terminated_length": 547.65625, "completions/min_length": 356.75, "completions/min_terminated_length": 356.75, "epoch": 0.05733333333333333, "grad_norm": 0.6168924570083618, "kl": 0.1142578125, "learning_rate": 1.14e-05, "loss": -0.0371, "num_tokens": 8448191.0, "reward": 0.9587239623069763, "reward_std": 0.09781504608690739, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9322916716337204, "rewards/reasoning_steps_reward/std": 0.13625510036945343, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.25, "completions/max_terminated_length": 756.25, "completions/mean_length": 538.265625, "completions/mean_terminated_length": 538.265625, "completions/min_length": 314.75, "completions/min_terminated_length": 314.75, "epoch": 0.057666666666666665, "grad_norm": 0.28591179847717285, "kl": 0.106689453125, "learning_rate": 1.1466666666666668e-05, "loss": -0.0214, "num_tokens": 8492928.0, "reward": 0.984375, "reward_std": 0.029949801042675972, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.96875, "rewards/reasoning_steps_reward/std": 0.05989960581064224, 
"rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 600.078125, "completions/mean_terminated_length": 600.078125, "completions/min_length": 428.5, "completions/min_terminated_length": 428.5, "epoch": 0.058, "grad_norm": 0.6120237112045288, "kl": 0.1251220703125, "learning_rate": 1.1533333333333334e-05, "loss": 0.0012, "num_tokens": 8542149.0, "reward": 1.0353666841983795, "reward_std": 0.1445917427772656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.035757310688495636, "rewards/penalized_accuracy_reward/std": 0.14302925765514374, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 174 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 546.40625, "completions/mean_terminated_length": 546.40625, "completions/min_length": 326.5, "completions/min_terminated_length": 326.5, "epoch": 0.058333333333333334, "grad_norm": 0.8150026798248291, "kl": 0.107177734375, "learning_rate": 1.16e-05, "loss": 0.0531, "num_tokens": 8586303.0, "reward": 0.9785156399011612, "reward_std": 0.061021566041745245, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.984375, "rewards/reasoning_steps_reward/std": 0.045325469225645065, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 939.5, 
"completions/max_terminated_length": 900.25, "completions/mean_length": 603.34375, "completions/mean_terminated_length": 596.8812561035156, "completions/min_length": 323.75, "completions/min_terminated_length": 323.75, "epoch": 0.058666666666666666, "grad_norm": 0.5437741875648499, "kl": 0.104736328125, "learning_rate": 1.1666666666666668e-05, "loss": 0.0275, "num_tokens": 8636565.0, "reward": 1.141858160495758, "reward_std": 0.30250774696469307, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.15188416838645935, "rewards/penalized_accuracy_reward/std": 0.27170804142951965, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.75, "completions/max_terminated_length": 859.75, "completions/mean_length": 575.453125, "completions/mean_terminated_length": 575.453125, "completions/min_length": 334.25, "completions/min_terminated_length": 334.25, "epoch": 0.059, "grad_norm": 0.7538095116615295, "kl": 0.107421875, "learning_rate": 1.1733333333333335e-05, "loss": -0.0034, "num_tokens": 8680978.0, "reward": 1.022205427289009, "reward_std": 0.15210942446719855, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.031189795583486557, "rewards/penalized_accuracy_reward/std": 0.12475918233394623, "rewards/reasoning_steps_reward/mean": 0.984375, "rewards/reasoning_steps_reward/std": 0.045325469225645065, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 577.6875, 
"completions/mean_terminated_length": 577.6875, "completions/min_length": 342.5, "completions/min_terminated_length": 342.5, "epoch": 0.059333333333333335, "grad_norm": 0.43398621678352356, "kl": 0.126708984375, "learning_rate": 1.18e-05, "loss": -0.0072, "num_tokens": 8726478.0, "reward": 0.9947916716337204, "reward_std": 0.01423187181353569, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.028463751077651978, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 178 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 982.0, "completions/max_terminated_length": 876.5, "completions/mean_length": 672.703125, "completions/mean_terminated_length": 656.7687835693359, "completions/min_length": 471.25, "completions/min_terminated_length": 471.25, "epoch": 0.059666666666666666, "grad_norm": 0.6989478468894958, "kl": 0.08056640625, "learning_rate": 1.186666666666667e-05, "loss": 0.0736, "num_tokens": 8781163.0, "reward": 1.0119648873806, "reward_std": 0.20964388456195593, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.1875, "rewards/penalized_accuracy_reward/mean": 0.03527218475937843, "rewards/penalized_accuracy_reward/std": 0.14108875393867493, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.078125, "step": 179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 947.75, "completions/max_terminated_length": 932.25, "completions/mean_length": 661.421875, "completions/mean_terminated_length": 640.4619293212891, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 
0.06, "grad_norm": 0.6695852279663086, "kl": 0.0841064453125, "learning_rate": 1.1933333333333335e-05, "loss": 0.0695, "num_tokens": 8836006.0, "reward": 0.963020846247673, "reward_std": 0.10263045411556959, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.21039126068353653, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9791666865348816, "rewards/reasoning_steps_reward/std": 0.07013041526079178, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.05259781517088413, "step": 180 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.75, "completions/max_terminated_length": 914.75, "completions/mean_length": 618.546875, "completions/mean_terminated_length": 618.546875, "completions/min_length": 387.5, "completions/min_terminated_length": 387.5, "epoch": 0.060333333333333336, "grad_norm": 0.6475189924240112, "kl": 0.1094970703125, "learning_rate": 1.2e-05, "loss": -0.0183, "num_tokens": 8887369.0, "reward": 1.093987837433815, "reward_std": 0.3499455749988556, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.10440447181463242, "rewards/penalized_accuracy_reward/std": 0.3267183154821396, "rewards/reasoning_steps_reward/mean": 0.9791666865348816, "rewards/reasoning_steps_reward/std": 0.0833333283662796, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.25, "completions/max_terminated_length": 635.25, "completions/mean_length": 464.8125, "completions/mean_terminated_length": 464.8125, "completions/min_length": 333.5, "completions/min_terminated_length": 333.5, "epoch": 0.06066666666666667, "grad_norm": 0.7688916325569153, "kl": 0.1220703125, "learning_rate": 1.206666666666667e-05, "loss": -0.0231, "num_tokens": 8927805.0, 
"reward": 0.9999911040067673, "reward_std": 0.1609197175130248, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.023428570479154587, "rewards/penalized_accuracy_reward/std": 0.09371428191661835, "rewards/reasoning_steps_reward/mean": 0.9531250149011612, "rewards/reasoning_steps_reward/std": 0.17032546550035477, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.5, "completions/max_terminated_length": 737.5, "completions/mean_length": 523.21875, "completions/mean_terminated_length": 523.21875, "completions/min_length": 291.5, "completions/min_terminated_length": 291.5, "epoch": 0.061, "grad_norm": 0.7014211416244507, "kl": 0.1153564453125, "learning_rate": 1.2133333333333335e-05, "loss": -0.0261, "num_tokens": 8970027.0, "reward": 1.2516676783561707, "reward_std": 0.3363241720944643, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2725009620189667, "rewards/penalized_accuracy_reward/std": 0.2822096049785614, "rewards/reasoning_steps_reward/mean": 0.9583333432674408, "rewards/reasoning_steps_reward/std": 0.13775940239429474, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.5, "completions/max_terminated_length": 844.5, "completions/mean_length": 555.703125, "completions/mean_terminated_length": 555.703125, "completions/min_length": 349.75, "completions/min_terminated_length": 349.75, "epoch": 0.06133333333333333, "grad_norm": 0.5648781061172485, "kl": 0.08984375, "learning_rate": 1.22e-05, "loss": 0.0239, "num_tokens": 9017016.0, "reward": 1.2053382843732834, "reward_std": 0.3726501800119877, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, 
"rewards/penalized_accuracy_reward/mean": 0.22760391235351562, "rewards/penalized_accuracy_reward/std": 0.34619054943323135, "rewards/reasoning_steps_reward/mean": 0.96875, "rewards/reasoning_steps_reward/std": 0.09065093845129013, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 825.75, "completions/max_terminated_length": 814.0, "completions/mean_length": 582.90625, "completions/mean_terminated_length": 579.5052185058594, "completions/min_length": 347.25, "completions/min_terminated_length": 347.25, "epoch": 0.06166666666666667, "grad_norm": 0.7277988195419312, "kl": 0.1080322265625, "learning_rate": 1.2266666666666667e-05, "loss": -0.0195, "num_tokens": 9062834.0, "reward": 1.0418638736009598, "reward_std": 0.2697664760053158, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.0689472034573555, "rewards/penalized_accuracy_reward/std": 0.18844179809093475, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.08699213340878487, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.75, "completions/max_terminated_length": 900.75, "completions/mean_length": 635.71875, "completions/mean_terminated_length": 635.71875, "completions/min_length": 447.5, "completions/min_terminated_length": 447.5, "epoch": 0.062, "grad_norm": 0.35816872119903564, "kl": 0.1043701171875, "learning_rate": 1.2333333333333334e-05, "loss": 0.014, "num_tokens": 9112192.0, "reward": 0.9973958432674408, "reward_std": 0.010416663251817226, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 969.0, "completions/max_terminated_length": 957.75, "completions/mean_length": 775.0625, "completions/mean_terminated_length": 769.0535736083984, "completions/min_length": 553.75, "completions/min_terminated_length": 553.75, "epoch": 0.06233333333333333, "grad_norm": 0.48311498761177063, "kl": 0.1060791015625, "learning_rate": 1.2400000000000002e-05, "loss": 0.0115, "num_tokens": 9172964.0, "reward": 0.9718749970197678, "reward_std": 0.07948593609035015, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.14789126068353653, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.04929708316922188, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.049619100987911224, "step": 187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 898.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 708.859375, "completions/mean_terminated_length": 704.3979187011719, "completions/min_length": 497.25, "completions/min_terminated_length": 497.25, "epoch": 0.06266666666666666, "grad_norm": 0.5689432621002197, "kl": 0.1102294921875, "learning_rate": 1.2466666666666667e-05, "loss": 0.0226, "num_tokens": 9228219.0, "reward": 1.0231836587190628, "reward_std": 0.16981789749115705, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.032819055020809174, "rewards/penalized_accuracy_reward/std": 0.1312762200832367, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, 
"rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1018.25, "completions/max_terminated_length": 991.0, "completions/mean_length": 846.1875, "completions/mean_terminated_length": 795.1122283935547, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 0.063, "grad_norm": 0.5394105911254883, "kl": 0.095703125, "learning_rate": 1.2533333333333336e-05, "loss": 0.0808, "num_tokens": 9292071.0, "reward": 0.9207031279802322, "reward_std": 0.12802047468721867, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.27699070423841476, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.18513670563697815, "step": 189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 981.5, "completions/max_terminated_length": 957.5, "completions/mean_length": 824.4375, "completions/mean_terminated_length": 787.9070129394531, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "epoch": 0.06333333333333334, "grad_norm": 0.6665021777153015, "kl": 0.0994873046875, "learning_rate": 1.2600000000000001e-05, "loss": 0.0531, "num_tokens": 9355107.0, "reward": 0.931629091501236, "reward_std": 0.2687137797474861, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.2675696536898613, "rewards/penalized_accuracy_reward/mean": 0.0352749302983284, "rewards/penalized_accuracy_reward/std": 0.1410997211933136, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.890625, 
"rewards/tag_count_reward/std": 0.15779344737529755, "step": 190 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.25, "completions/max_terminated_length": 899.25, "completions/mean_length": 663.390625, "completions/mean_terminated_length": 663.390625, "completions/min_length": 487.75, "completions/min_terminated_length": 487.75, "epoch": 0.06366666666666666, "grad_norm": 0.3484903872013092, "kl": 0.0921630859375, "learning_rate": 1.2666666666666667e-05, "loss": -0.004, "num_tokens": 9412332.0, "reward": 1.1529858112335205, "reward_std": 0.27366939187049866, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.15298579633235931, "rewards/penalized_accuracy_reward/std": 0.27366936206817627, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 191 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 987.75, "completions/max_terminated_length": 967.5, "completions/mean_length": 771.890625, "completions/mean_terminated_length": 761.0098266601562, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "epoch": 0.064, "grad_norm": 0.47288382053375244, "kl": 0.1241455078125, "learning_rate": 1.2733333333333336e-05, "loss": 0.0468, "num_tokens": 9473109.0, "reward": 0.9766927063465118, "reward_std": 0.0663717407733202, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.14789126068353653, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06403729319572449, "step": 192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 
884.75, "completions/max_terminated_length": 884.75, "completions/mean_length": 631.03125, "completions/mean_terminated_length": 631.03125, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.06433333333333334, "grad_norm": 0.6051411032676697, "kl": 0.0982666015625, "learning_rate": 1.2800000000000001e-05, "loss": -0.0479, "num_tokens": 9522567.0, "reward": 1.0272425264120102, "reward_std": 0.23006188031286, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.053284166380763054, "rewards/penalized_accuracy_reward/std": 0.21313666552305222, "rewards/reasoning_steps_reward/mean": 0.9479166865348816, "rewards/reasoning_steps_reward/std": 0.14323293417692184, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.25, "completions/max_terminated_length": 893.25, "completions/mean_length": 694.5, "completions/mean_terminated_length": 694.5, "completions/min_length": 479.5, "completions/min_terminated_length": 479.5, "epoch": 0.06466666666666666, "grad_norm": 0.6984221339225769, "kl": 0.107421875, "learning_rate": 1.2866666666666667e-05, "loss": -0.0016, "num_tokens": 9580599.0, "reward": 1.1104334741830826, "reward_std": 0.30629251059144735, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1234542764723301, "rewards/penalized_accuracy_reward/std": 0.2720722556114197, "rewards/reasoning_steps_reward/mean": 0.973958358168602, "rewards/reasoning_steps_reward/std": 0.1041666604578495, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.25, "completions/max_terminated_length": 800.25, "completions/mean_length": 615.984375, "completions/mean_terminated_length": 615.984375, 
"completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.065, "grad_norm": 0.4620238244533539, "kl": 0.1046142578125, "learning_rate": 1.2933333333333334e-05, "loss": -0.0052, "num_tokens": 9628998.0, "reward": 0.9947916865348816, "reward_std": 0.020833326503634453, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 886.0, "completions/max_terminated_length": 833.75, "completions/mean_length": 615.359375, "completions/mean_terminated_length": 605.1760559082031, "completions/min_length": 430.25, "completions/min_terminated_length": 430.25, "epoch": 0.06533333333333333, "grad_norm": 0.5173254013061523, "kl": 0.116943359375, "learning_rate": 1.3000000000000001e-05, "loss": 0.0173, "num_tokens": 9678045.0, "reward": 0.9632812589406967, "reward_std": 0.08124083653092384, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9531250149011612, "rewards/reasoning_steps_reward/std": 0.1219017468392849, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 196 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.75, "completions/max_terminated_length": 746.75, "completions/mean_length": 528.8125, "completions/mean_terminated_length": 528.8125, "completions/min_length": 342.25, "completions/min_terminated_length": 342.25, "epoch": 0.06566666666666666, "grad_norm": 0.796053946018219, "kl": 0.1429443359375, "learning_rate": 
1.3066666666666668e-05, "loss": -0.0489, "num_tokens": 9726577.0, "reward": 1.0946148484945297, "reward_std": 0.3945549316704273, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.11427633091807365, "rewards/penalized_accuracy_reward/std": 0.3603511452674866, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.0727677047252655, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.25, "completions/max_terminated_length": 629.25, "completions/mean_length": 489.96875, "completions/mean_terminated_length": 489.96875, "completions/min_length": 350.5, "completions/min_terminated_length": 350.5, "epoch": 0.066, "grad_norm": 0.688350260257721, "kl": 0.1165771484375, "learning_rate": 1.3133333333333334e-05, "loss": -0.0299, "num_tokens": 9768255.0, "reward": 1.218723639845848, "reward_std": 0.22623951733112335, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.24216115474700928, "rewards/penalized_accuracy_reward/std": 0.19652985036373138, "rewards/reasoning_steps_reward/mean": 0.9531250149011612, "rewards/reasoning_steps_reward/std": 0.11016901582479477, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.5, "completions/max_terminated_length": 645.5, "completions/mean_length": 484.90625, "completions/mean_terminated_length": 484.90625, "completions/min_length": 348.25, "completions/min_terminated_length": 348.25, "epoch": 0.06633333333333333, "grad_norm": 0.7640448212623596, "kl": 0.106689453125, "learning_rate": 1.3200000000000002e-05, "loss": -0.034, "num_tokens": 9808185.0, "reward": 1.189970687031746, "reward_std": 0.30377752613276243, 
"rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.22903314232826233, "rewards/penalized_accuracy_reward/std": 0.23654542863368988, "rewards/reasoning_steps_reward/mean": 0.9218750149011612, "rewards/reasoning_steps_reward/std": 0.20916644483804703, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 199 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 534.609375, "completions/mean_terminated_length": 534.609375, "completions/min_length": 399.5, "completions/min_terminated_length": 399.5, "epoch": 0.06666666666666667, "grad_norm": 0.7903880476951599, "kl": 0.1226806640625, "learning_rate": 1.3266666666666668e-05, "loss": 0.019, "num_tokens": 9852080.0, "reward": 1.1319045722484589, "reward_std": 0.28382984828203917, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16315454244613647, "rewards/penalized_accuracy_reward/std": 0.2500864267349243, "rewards/reasoning_steps_reward/mean": 0.9375000149011612, "rewards/reasoning_steps_reward/std": 0.15744590014219284, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 200 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.5, "completions/max_terminated_length": 801.5, "completions/mean_length": 615.65625, "completions/mean_terminated_length": 615.65625, "completions/min_length": 399.75, "completions/min_terminated_length": 399.75, "epoch": 0.067, "grad_norm": 0.6389729380607605, "kl": 0.121337890625, "learning_rate": 1.3333333333333333e-05, "loss": -0.0109, "num_tokens": 9899770.0, "reward": 1.1019178926944733, "reward_std": 0.2688593650236726, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11233451962471008, 
"rewards/penalized_accuracy_reward/std": 0.24164631962776184, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.05442607030272484, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 201 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.5, "completions/max_terminated_length": 857.5, "completions/mean_length": 635.640625, "completions/mean_terminated_length": 635.640625, "completions/min_length": 443.5, "completions/min_terminated_length": 443.5, "epoch": 0.06733333333333333, "grad_norm": 0.5509011745452881, "kl": 0.124755859375, "learning_rate": 1.3400000000000002e-05, "loss": 0.0041, "num_tokens": 9951955.0, "reward": 0.9807291626930237, "reward_std": 0.054120369255542755, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.08699213340878487, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 202 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.75, "completions/max_terminated_length": 738.75, "completions/mean_length": 535.328125, "completions/mean_terminated_length": 535.328125, "completions/min_length": 364.5, "completions/min_terminated_length": 364.5, "epoch": 0.06766666666666667, "grad_norm": 0.527125358581543, "kl": 0.154052734375, "learning_rate": 1.3466666666666668e-05, "loss": 0.012, "num_tokens": 10002312.0, "reward": 1.4380362629890442, "reward_std": 0.471890464425087, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.4484528675675392, "rewards/penalized_accuracy_reward/std": 0.45712582767009735, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 
0.0833333320915699, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 203 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.5, "completions/max_terminated_length": 901.5, "completions/mean_length": 578.421875, "completions/mean_terminated_length": 578.421875, "completions/min_length": 352.25, "completions/min_terminated_length": 352.25, "epoch": 0.068, "grad_norm": 0.48045891523361206, "kl": 0.127685546875, "learning_rate": 1.3533333333333333e-05, "loss": -0.0418, "num_tokens": 10048499.0, "reward": 1.0155680775642395, "reward_std": 0.1726234508678317, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03379720076918602, "rewards/penalized_accuracy_reward/std": 0.13518880307674408, "rewards/reasoning_steps_reward/mean": 0.9635416865348816, "rewards/reasoning_steps_reward/std": 0.11061252281069756, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 975.75, "completions/max_terminated_length": 966.5, "completions/mean_length": 748.6875, "completions/mean_terminated_length": 729.5687713623047, "completions/min_length": 463.25, "completions/min_terminated_length": 463.25, "epoch": 0.06833333333333333, "grad_norm": 0.6406579613685608, "kl": 0.1331787109375, "learning_rate": 1.3600000000000002e-05, "loss": 0.0482, "num_tokens": 10108911.0, "reward": 0.9862234890460968, "reward_std": 0.21676897443830967, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.1875, "rewards/penalized_accuracy_reward/mean": 0.03348911926150322, "rewards/penalized_accuracy_reward/std": 0.13395647704601288, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.078125, "step": 205 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.03125, "completions/max_length": 913.25, "completions/max_terminated_length": 869.75, "completions/mean_length": 655.359375, "completions/mean_terminated_length": 646.7114715576172, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.06866666666666667, "grad_norm": 0.699122965335846, "kl": 0.1441650390625, "learning_rate": 1.3666666666666667e-05, "loss": -0.0187, "num_tokens": 10160166.0, "reward": 1.2349582314491272, "reward_std": 0.5138601027429104, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.25, "rewards/penalized_accuracy_reward/mean": 0.2742811441421509, "rewards/penalized_accuracy_reward/std": 0.4517088681459427, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.06454972177743912, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.15625, "step": 206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 989.5, "completions/max_terminated_length": 881.75, "completions/mean_length": 710.875, "completions/mean_terminated_length": 677.321891784668, "completions/min_length": 426.25, "completions/min_terminated_length": 426.25, "epoch": 0.069, "grad_norm": 0.678071141242981, "kl": 0.1116943359375, "learning_rate": 1.3733333333333335e-05, "loss": 0.0659, "num_tokens": 10216158.0, "reward": 0.9995495826005936, "reward_std": 0.30108168721199036, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.29398179799318314, "rewards/penalized_accuracy_reward/mean": 0.07025270164012909, "rewards/penalized_accuracy_reward/std": 0.19196806848049164, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.15412572026252747, "step": 207 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 969.75, "completions/mean_length": 803.390625, "completions/mean_terminated_length": 779.1112823486328, "completions/min_length": 575.5, "completions/min_terminated_length": 575.5, "epoch": 0.06933333333333333, "grad_norm": 0.6706548929214478, "kl": 0.1131591796875, "learning_rate": 1.38e-05, "loss": 0.058, "num_tokens": 10279543.0, "reward": 1.1049699932336807, "reward_std": 0.3561432473361492, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.31116948276758194, "rewards/penalized_accuracy_reward/mean": 0.15562105178833008, "rewards/penalized_accuracy_reward/std": 0.23856909573078156, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.129237312823534, "step": 208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 955.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 654.59375, "completions/mean_terminated_length": 649.652099609375, "completions/min_length": 436.25, "completions/min_terminated_length": 436.25, "epoch": 0.06966666666666667, "grad_norm": 0.5483206510543823, "kl": 0.128173828125, "learning_rate": 1.3866666666666669e-05, "loss": 0.0133, "num_tokens": 10330861.0, "reward": 1.0247620195150375, "reward_std": 0.17300635110586882, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.03400679677724838, "rewards/penalized_accuracy_reward/std": 0.13602718710899353, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 209 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 922.75, "completions/max_terminated_length": 900.25, 
"completions/mean_length": 653.25, "completions/mean_terminated_length": 648.4000091552734, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.07, "grad_norm": 0.5754551291465759, "kl": 0.11962890625, "learning_rate": 1.3933333333333334e-05, "loss": 0.0279, "num_tokens": 10381597.0, "reward": 1.32256717979908, "reward_std": 0.33077580854296684, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.3414474129676819, "rewards/penalized_accuracy_reward/std": 0.311147004365921, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 210 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.75, "completions/max_terminated_length": 784.75, "completions/mean_length": 588.21875, "completions/mean_terminated_length": 588.21875, "completions/min_length": 399.5, "completions/min_terminated_length": 399.5, "epoch": 0.07033333333333333, "grad_norm": 0.3139912486076355, "kl": 0.1322021484375, "learning_rate": 1.4e-05, "loss": 0.0098, "num_tokens": 10429323.0, "reward": 0.9921875, "reward_std": 0.022662732750177383, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.984375, "rewards/reasoning_steps_reward/std": 0.045325469225645065, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 211 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.5, "completions/max_terminated_length": 847.5, "completions/mean_length": 609.25, "completions/mean_terminated_length": 609.25, "completions/min_length": 376.5, "completions/min_terminated_length": 376.5, "epoch": 0.07066666666666667, 
"grad_norm": 0.7940312027931213, "kl": 0.1170654296875, "learning_rate": 1.4066666666666669e-05, "loss": -0.0041, "num_tokens": 10481179.0, "reward": 1.0844275057315826, "reward_std": 0.3520566299557686, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.10109420865774155, "rewards/penalized_accuracy_reward/std": 0.319944828748703, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.049619100987911224, "step": 212 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.25, "completions/max_terminated_length": 751.25, "completions/mean_length": 569.296875, "completions/mean_terminated_length": 569.296875, "completions/min_length": 402.5, "completions/min_terminated_length": 402.5, "epoch": 0.071, "grad_norm": 0.3268507421016693, "kl": 0.1513671875, "learning_rate": 1.4133333333333334e-05, "loss": 0.006, "num_tokens": 10529582.0, "reward": 0.9973958432674408, "reward_std": 0.010416663251817226, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 213 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 497.484375, "completions/mean_terminated_length": 497.484375, "completions/min_length": 295.25, "completions/min_terminated_length": 295.25, "epoch": 0.07133333333333333, "grad_norm": 0.8682353496551514, "kl": 0.1650390625, "learning_rate": 1.4200000000000001e-05, "loss": 0.033, "num_tokens": 10576925.0, "reward": 
1.4779804199934006, "reward_std": 0.5776420421898365, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.4968606233596802, "rewards/penalized_accuracy_reward/std": 0.5501919239759445, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.028463751077651978, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 214 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.25, "completions/max_terminated_length": 707.25, "completions/mean_length": 487.40625, "completions/mean_terminated_length": 487.40625, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.07166666666666667, "grad_norm": 0.6823318600654602, "kl": 0.1435546875, "learning_rate": 1.4266666666666668e-05, "loss": -0.0286, "num_tokens": 10615719.0, "reward": 1.2218518257141113, "reward_std": 0.3399867806583643, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.24672159552574158, "rewards/penalized_accuracy_reward/std": 0.2893291413784027, "rewards/reasoning_steps_reward/mean": 0.9635416716337204, "rewards/reasoning_steps_reward/std": 0.12865880131721497, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.75, "completions/max_terminated_length": 803.75, "completions/mean_length": 544.71875, "completions/mean_terminated_length": 544.71875, "completions/min_length": 350.75, "completions/min_terminated_length": 350.75, "epoch": 0.072, "grad_norm": 0.5547329783439636, "kl": 0.165771484375, "learning_rate": 1.4333333333333334e-05, "loss": -0.0017, "num_tokens": 10659861.0, "reward": 1.2048795819282532, "reward_std": 0.3519400358200073, "rewards/format_reward/mean": 1.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.20487959496676922, "rewards/penalized_accuracy_reward/std": 0.35194002091884613, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 544.296875, "completions/mean_terminated_length": 544.296875, "completions/min_length": 341.75, "completions/min_terminated_length": 341.75, "epoch": 0.07233333333333333, "grad_norm": 0.8118423819541931, "kl": 0.131591796875, "learning_rate": 1.4400000000000001e-05, "loss": -0.0361, "num_tokens": 10703416.0, "reward": 1.0395066440105438, "reward_std": 0.3176156934350729, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.06698058173060417, "rewards/penalized_accuracy_reward/std": 0.2679223269224167, "rewards/reasoning_steps_reward/mean": 0.958333358168602, "rewards/reasoning_steps_reward/std": 0.09419529885053635, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 217 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 573.734375, "completions/mean_terminated_length": 573.734375, "completions/min_length": 383.25, "completions/min_terminated_length": 383.25, "epoch": 0.07266666666666667, "grad_norm": 0.3515082001686096, "kl": 0.1280517578125, "learning_rate": 1.4466666666666668e-05, "loss": 0.0054, "num_tokens": 10748999.0, "reward": 0.9921875, "reward_std": 0.03125, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.984375, "rewards/reasoning_steps_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 218 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 628.140625, "completions/mean_terminated_length": 628.140625, "completions/min_length": 400.5, "completions/min_terminated_length": 400.5, "epoch": 0.073, "grad_norm": 0.5115145444869995, "kl": 0.146728515625, "learning_rate": 1.4533333333333335e-05, "loss": -0.0195, "num_tokens": 10798048.0, "reward": 0.989583358168602, "reward_std": 0.03506519831717014, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9791666865348816, "rewards/reasoning_steps_reward/std": 0.07013041526079178, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 885.25, "completions/max_terminated_length": 874.0, "completions/mean_length": 538.0625, "completions/mean_terminated_length": 529.6958389282227, "completions/min_length": 195.5, "completions/min_terminated_length": 195.5, "epoch": 0.07333333333333333, "grad_norm": 0.7975037097930908, "kl": 0.14453125, "learning_rate": 1.46e-05, "loss": -0.0692, "num_tokens": 10846804.0, "reward": 1.0960131287574768, "reward_std": 0.4025039039552212, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.12348708137869835, "rewards/penalized_accuracy_reward/std": 0.3375791162252426, "rewards/reasoning_steps_reward/mean": 0.9583333432674408, "rewards/reasoning_steps_reward/std": 0.14949213340878487, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 
0.015625, "step": 220 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 844.75, "completions/max_terminated_length": 825.5, "completions/mean_length": 509.5, "completions/mean_terminated_length": 503.57188415527344, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.07366666666666667, "grad_norm": 0.48976442217826843, "kl": 0.140380859375, "learning_rate": 1.4666666666666666e-05, "loss": 0.0123, "num_tokens": 10888356.0, "reward": 1.0810218900442123, "reward_std": 0.21577799692749977, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.09026668965816498, "rewards/penalized_accuracy_reward/std": 0.1944141685962677, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 577.078125, "completions/mean_terminated_length": 577.078125, "completions/min_length": 369.5, "completions/min_terminated_length": 369.5, "epoch": 0.074, "grad_norm": 0.40557998418807983, "kl": 0.16455078125, "learning_rate": 1.4733333333333335e-05, "loss": 0.0062, "num_tokens": 10935321.0, "reward": 1.033362090587616, "reward_std": 0.13344836235046387, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03336208686232567, "rewards/penalized_accuracy_reward/std": 0.13344834744930267, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 897.75, 
"completions/max_terminated_length": 883.75, "completions/mean_length": 655.078125, "completions/mean_terminated_length": 647.2142944335938, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.07433333333333333, "grad_norm": 0.6898764967918396, "kl": 0.155517578125, "learning_rate": 1.48e-05, "loss": 0.0313, "num_tokens": 10985406.0, "reward": 1.0111503452062607, "reward_std": 0.15717457351274788, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.08539126068353653, "rewards/penalized_accuracy_reward/mean": 0.028598245233297348, "rewards/penalized_accuracy_reward/std": 0.11439298838376999, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07966229319572449, "step": 223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 893.25, "completions/max_terminated_length": 889.75, "completions/mean_length": 664.59375, "completions/mean_terminated_length": 660.8270874023438, "completions/min_length": 344.5, "completions/min_terminated_length": 344.5, "epoch": 0.07466666666666667, "grad_norm": 0.41226571798324585, "kl": 0.162841796875, "learning_rate": 1.4866666666666668e-05, "loss": -0.0158, "num_tokens": 11037940.0, "reward": 0.9873698055744171, "reward_std": 0.04121637064963579, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 920.25, "completions/max_terminated_length": 895.5, "completions/mean_length": 717.703125, 
"completions/mean_terminated_length": 703.4521179199219, "completions/min_length": 513.75, "completions/min_terminated_length": 513.75, "epoch": 0.075, "grad_norm": 0.6623892188072205, "kl": 0.146728515625, "learning_rate": 1.4933333333333335e-05, "loss": 0.0225, "num_tokens": 11091969.0, "reward": 0.9588541686534882, "reward_std": 0.12408353574573994, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.21039126068353653, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.058679524809122086, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11091229319572449, "step": 225 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.25, "completions/max_terminated_length": 874.25, "completions/mean_length": 585.78125, "completions/mean_terminated_length": 585.78125, "completions/min_length": 329.25, "completions/min_terminated_length": 329.25, "epoch": 0.07533333333333334, "grad_norm": 0.5859063863754272, "kl": 0.143798828125, "learning_rate": 1.5000000000000002e-05, "loss": 0.0193, "num_tokens": 11138291.0, "reward": 1.2298401892185211, "reward_std": 0.44086357951164246, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2428610548377037, "rewards/penalized_accuracy_reward/std": 0.4119364768266678, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.1041666641831398, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 226 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.75, "completions/max_terminated_length": 897.75, "completions/mean_length": 602.875, "completions/mean_terminated_length": 602.875, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 
0.07566666666666666, "grad_norm": 0.48775744438171387, "kl": 0.15869140625, "learning_rate": 1.5066666666666668e-05, "loss": 0.0067, "num_tokens": 11185931.0, "reward": 1.0926258563995361, "reward_std": 0.2955952286720276, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09262584149837494, "rewards/penalized_accuracy_reward/std": 0.2955952137708664, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 892.75, "completions/max_terminated_length": 887.25, "completions/mean_length": 649.046875, "completions/mean_terminated_length": 635.8930358886719, "completions/min_length": 361.5, "completions/min_terminated_length": 361.5, "epoch": 0.076, "grad_norm": 0.5001335740089417, "kl": 0.173095703125, "learning_rate": 1.5133333333333335e-05, "loss": 0.0474, "num_tokens": 11241678.0, "reward": 0.9748698025941849, "reward_std": 0.06366407126188278, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.10077822208404541, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.025194555521011353, "step": 228 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 891.25, "completions/max_terminated_length": 854.25, "completions/mean_length": 624.65625, "completions/mean_terminated_length": 614.6187744140625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.07633333333333334, "grad_norm": 0.47809383273124695, "kl": 0.1624755859375, "learning_rate": 1.5200000000000002e-05, "loss": -0.0097, 
"num_tokens": 11291640.0, "reward": 1.1923899203538895, "reward_std": 0.3330077975988388, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.21686908602714539, "rewards/penalized_accuracy_reward/std": 0.28924980759620667, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.0833333320915699, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 229 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.5, "completions/max_terminated_length": 843.5, "completions/mean_length": 557.65625, "completions/mean_terminated_length": 557.65625, "completions/min_length": 289.25, "completions/min_terminated_length": 289.25, "epoch": 0.07666666666666666, "grad_norm": 0.5777904987335205, "kl": 0.12939453125, "learning_rate": 1.5266666666666667e-05, "loss": -0.0835, "num_tokens": 11338354.0, "reward": 0.9869791865348816, "reward_std": 0.043496059253811836, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.08699213340878487, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 230 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 847.75, "completions/max_terminated_length": 777.75, "completions/mean_length": 503.15625, "completions/mean_terminated_length": 495.65834045410156, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.077, "grad_norm": 0.49381595849990845, "kl": 0.149169921875, "learning_rate": 1.5333333333333334e-05, "loss": 0.0168, "num_tokens": 11380140.0, "reward": 0.9829427152872086, "reward_std": 0.050058203749358654, "rewards/format_reward/mean": 0.984375, 
"rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.0833333320915699, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 231 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.75, "completions/max_terminated_length": 814.75, "completions/mean_length": 585.375, "completions/mean_terminated_length": 585.375, "completions/min_length": 399.75, "completions/min_terminated_length": 399.75, "epoch": 0.07733333333333334, "grad_norm": 0.5595817565917969, "kl": 0.18212890625, "learning_rate": 1.54e-05, "loss": 0.0464, "num_tokens": 11428132.0, "reward": 1.1764193773269653, "reward_std": 0.33044473826885223, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17641937546432018, "rewards/penalized_accuracy_reward/std": 0.330444760620594, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 232 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.5, "completions/max_terminated_length": 817.5, "completions/mean_length": 592.75, "completions/mean_terminated_length": 592.75, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.07766666666666666, "grad_norm": 0.547507107257843, "kl": 0.152099609375, "learning_rate": 1.546666666666667e-05, "loss": 0.0036, "num_tokens": 11476884.0, "reward": 1.026512697339058, "reward_std": 0.1592193841934204, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03432518616318703, "rewards/penalized_accuracy_reward/std": 0.1373007446527481, "rewards/reasoning_steps_reward/mean": 
0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.0624999962747097, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 233 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.25, "completions/max_terminated_length": 990.25, "completions/mean_length": 614.59375, "completions/mean_terminated_length": 614.59375, "completions/min_length": 358.5, "completions/min_terminated_length": 358.5, "epoch": 0.078, "grad_norm": 0.4871361553668976, "kl": 0.1551513671875, "learning_rate": 1.5533333333333333e-05, "loss": -0.0094, "num_tokens": 11528666.0, "reward": 0.9895833432674408, "reward_std": 0.03506520017981529, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9791666865348816, "rewards/reasoning_steps_reward/std": 0.07013041526079178, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 234 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.75, "completions/max_terminated_length": 848.75, "completions/mean_length": 588.375, "completions/mean_terminated_length": 588.375, "completions/min_length": 317.5, "completions/min_terminated_length": 317.5, "epoch": 0.07833333333333334, "grad_norm": 0.7485732436180115, "kl": 0.16357421875, "learning_rate": 1.5600000000000003e-05, "loss": -0.0261, "num_tokens": 11580130.0, "reward": 1.0502691864967346, "reward_std": 0.28249461110681295, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06589415855705738, "rewards/penalized_accuracy_reward/std": 0.2635766342282295, "rewards/reasoning_steps_reward/mean": 0.9687500149011612, "rewards/reasoning_steps_reward/std": 0.1249999962747097, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 235 }, { "clip_ratio": 
0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 983.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 640.84375, "completions/mean_terminated_length": 627.8812713623047, "completions/min_length": 349.75, "completions/min_terminated_length": 349.75, "epoch": 0.07866666666666666, "grad_norm": 0.7337653636932373, "kl": 0.158203125, "learning_rate": 1.5666666666666667e-05, "loss": 0.0663, "num_tokens": 11631096.0, "reward": 0.9947942942380905, "reward_std": 0.16349013429135084, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.021877631545066833, "rewards/penalized_accuracy_reward/std": 0.08751052618026733, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.1041666641831398, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 236 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 918.25, "completions/max_terminated_length": 862.5, "completions/mean_length": 616.828125, "completions/mean_terminated_length": 609.6697998046875, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.079, "grad_norm": 0.6300519108772278, "kl": 0.158203125, "learning_rate": 1.5733333333333334e-05, "loss": 0.0524, "num_tokens": 11680093.0, "reward": 0.9907552152872086, "reward_std": 0.036979163996875286, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 237 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, 
"completions/mean_length": 623.21875, "completions/mean_terminated_length": 623.21875, "completions/min_length": 373.75, "completions/min_terminated_length": 373.75, "epoch": 0.07933333333333334, "grad_norm": 0.5821474194526672, "kl": 0.167724609375, "learning_rate": 1.58e-05, "loss": -0.0171, "num_tokens": 11730379.0, "reward": 1.0398301482200623, "reward_std": 0.18846993148326874, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.055455148220062256, "rewards/penalized_accuracy_reward/std": 0.1535360962152481, "rewards/reasoning_steps_reward/mean": 0.9687500298023224, "rewards/reasoning_steps_reward/std": 0.11179707944393158, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 238 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 552.4375, "completions/mean_terminated_length": 552.4375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.07966666666666666, "grad_norm": 0.37174347043037415, "kl": 0.140869140625, "learning_rate": 1.586666666666667e-05, "loss": -0.055, "num_tokens": 11777623.0, "reward": 0.9921875149011612, "reward_std": 0.03124999161809683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.0624999962747097, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 239 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.75, "completions/max_terminated_length": 915.75, "completions/mean_length": 667.171875, "completions/mean_terminated_length": 667.171875, "completions/min_length": 443.25, "completions/min_terminated_length": 443.25, "epoch": 
0.08, "grad_norm": 0.4442844092845917, "kl": 0.166748046875, "learning_rate": 1.5933333333333336e-05, "loss": 0.0116, "num_tokens": 11829234.0, "reward": 0.9973958432674408, "reward_std": 0.010416663251817226, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 240 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 587.765625, "completions/mean_terminated_length": 587.765625, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.08033333333333334, "grad_norm": 0.5305766463279724, "kl": 0.156005859375, "learning_rate": 1.6000000000000003e-05, "loss": -0.0256, "num_tokens": 11874867.0, "reward": 0.9921875149011612, "reward_std": 0.03124999161809683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.0624999962747097, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 241 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 948.0, "completions/max_terminated_length": 900.75, "completions/mean_length": 645.859375, "completions/mean_terminated_length": 624.7781372070312, "completions/min_length": 387.75, "completions/min_terminated_length": 387.75, "epoch": 0.08066666666666666, "grad_norm": 0.5956756472587585, "kl": 0.16650390625, "learning_rate": 1.606666666666667e-05, "loss": 0.0793, "num_tokens": 11923994.0, "reward": 1.2291074991226196, 
"reward_std": 0.497161440551281, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.17430340498685837, "rewards/penalized_accuracy_reward/mean": 0.26309189945459366, "rewards/penalized_accuracy_reward/std": 0.476230688393116, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.07482585124671459, "step": 242 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 975.75, "completions/max_terminated_length": 889.0, "completions/mean_length": 740.203125, "completions/mean_terminated_length": 730.7469024658203, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.081, "grad_norm": 0.6802368760108948, "kl": 0.1600341796875, "learning_rate": 1.6133333333333334e-05, "loss": 0.0122, "num_tokens": 11983095.0, "reward": 1.0122395604848862, "reward_std": 0.15847744420170784, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.03593749925494194, "rewards/penalized_accuracy_reward/std": 0.09819994866847992, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.048112526535987854, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 243 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 860.25, "completions/max_terminated_length": 860.0, "completions/mean_length": 608.390625, "completions/mean_terminated_length": 600.5848236083984, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.08133333333333333, "grad_norm": 0.6472396850585938, "kl": 0.15234375, "learning_rate": 1.62e-05, "loss": 0.0666, "num_tokens": 12032208.0, "reward": 1.2856135964393616, "reward_std": 0.19508842751383781, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 
0.08539126068353653, "rewards/penalized_accuracy_reward/mean": 0.30410313606262207, "rewards/penalized_accuracy_reward/std": 0.16097909212112427, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.021347815170884132, "step": 244 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 974.75, "completions/max_terminated_length": 957.5, "completions/mean_length": 700.984375, "completions/mean_terminated_length": 691.9833526611328, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.08166666666666667, "grad_norm": 0.6965994834899902, "kl": 0.1689453125, "learning_rate": 1.6266666666666668e-05, "loss": 0.0147, "num_tokens": 12086783.0, "reward": 0.9736979007720947, "reward_std": 0.08700593560934067, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.1041666641831398, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 245 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 912.5, "completions/max_terminated_length": 907.0, "completions/mean_length": 660.859375, "completions/mean_terminated_length": 656.5062561035156, "completions/min_length": 365.75, "completions/min_terminated_length": 365.75, "epoch": 0.082, "grad_norm": 0.7551724314689636, "kl": 0.1632080078125, "learning_rate": 1.6333333333333335e-05, "loss": 0.0243, "num_tokens": 12141766.0, "reward": 1.347618505358696, "reward_std": 0.7300410941243172, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.36806120723485947, 
"rewards/penalized_accuracy_reward/std": 0.6883421540260315, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.1041666641831398, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 246 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 622.515625, "completions/mean_terminated_length": 622.515625, "completions/min_length": 336.75, "completions/min_terminated_length": 336.75, "epoch": 0.08233333333333333, "grad_norm": 0.4745469093322754, "kl": 0.16064453125, "learning_rate": 1.64e-05, "loss": 0.0349, "num_tokens": 12192311.0, "reward": 1.404757171869278, "reward_std": 0.5057430565357208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.4073612689971924, "rewards/penalized_accuracy_reward/std": 0.5086499452590942, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 247 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.75, "completions/max_terminated_length": 822.75, "completions/mean_length": 583.453125, "completions/mean_terminated_length": 583.453125, "completions/min_length": 403.5, "completions/min_terminated_length": 403.5, "epoch": 0.08266666666666667, "grad_norm": 0.7214362621307373, "kl": 0.17041015625, "learning_rate": 1.646666666666667e-05, "loss": -0.0147, "num_tokens": 12238516.0, "reward": 1.0527930855751038, "reward_std": 0.23437042441219091, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.07024098187685013, "rewards/penalized_accuracy_reward/std": 0.19193504750728607, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, 
"rewards/reasoning_steps_reward/std": 0.0833333320915699, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.021347815170884132, "step": 248 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 854.75, "completions/max_terminated_length": 828.75, "completions/mean_length": 616.859375, "completions/mean_terminated_length": 612.0948028564453, "completions/min_length": 371.5, "completions/min_terminated_length": 371.5, "epoch": 0.083, "grad_norm": 0.6579822897911072, "kl": 0.1611328125, "learning_rate": 1.6533333333333333e-05, "loss": 0.0076, "num_tokens": 12285963.0, "reward": 1.0059796571731567, "reward_std": 0.15834287833422422, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.08539126068353653, "rewards/penalized_accuracy_reward/mean": 0.027073414996266365, "rewards/penalized_accuracy_reward/std": 0.10829365998506546, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.0624999962747097, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.021347815170884132, "step": 249 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 777.75, "completions/max_terminated_length": 750.0, "completions/mean_length": 547.78125, "completions/mean_terminated_length": 542.2343902587891, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.08333333333333333, "grad_norm": 0.9504538774490356, "kl": 0.186279296875, "learning_rate": 1.66e-05, "loss": 0.0474, "num_tokens": 12331069.0, "reward": 0.9817708432674408, "reward_std": 0.06432939507067204, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.06615880131721497, "rewards/tag_count_reward/mean": 
0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 250 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.5, "completions/max_terminated_length": 752.5, "completions/mean_length": 485.703125, "completions/mean_terminated_length": 485.703125, "completions/min_length": 248.75, "completions/min_terminated_length": 248.75, "epoch": 0.08366666666666667, "grad_norm": 0.9305344223976135, "kl": 0.1630859375, "learning_rate": 1.6666666666666667e-05, "loss": -0.0476, "num_tokens": 12374634.0, "reward": 1.0772633850574493, "reward_std": 0.3262657858431339, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11111752688884735, "rewards/penalized_accuracy_reward/std": 0.23903486132621765, "rewards/reasoning_steps_reward/mean": 0.9322916865348816, "rewards/reasoning_steps_reward/std": 0.17446180433034897, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 251 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.75, "completions/max_terminated_length": 704.75, "completions/mean_length": 485.328125, "completions/mean_terminated_length": 485.328125, "completions/min_length": 288.25, "completions/min_terminated_length": 288.25, "epoch": 0.084, "grad_norm": 0.8013951778411865, "kl": 0.16162109375, "learning_rate": 1.6733333333333335e-05, "loss": -0.0377, "num_tokens": 12416351.0, "reward": 1.0147216022014618, "reward_std": 0.204719758592546, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03295077010989189, "rewards/penalized_accuracy_reward/std": 0.13180309534072876, "rewards/reasoning_steps_reward/mean": 0.9635416716337204, "rewards/reasoning_steps_reward/std": 0.1458333320915699, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 252 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 
783.5, "completions/max_terminated_length": 783.5, "completions/mean_length": 513.40625, "completions/mean_terminated_length": 513.40625, "completions/min_length": 274.5, "completions/min_terminated_length": 274.5, "epoch": 0.08433333333333333, "grad_norm": 0.12178342789411545, "kl": 0.175048828125, "learning_rate": 1.6800000000000002e-05, "loss": 0.007, "num_tokens": 12462121.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 253 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.75, "completions/max_terminated_length": 851.75, "completions/mean_length": 581.484375, "completions/mean_terminated_length": 581.484375, "completions/min_length": 353.5, "completions/min_terminated_length": 353.5, "epoch": 0.08466666666666667, "grad_norm": 0.6153361797332764, "kl": 0.2001953125, "learning_rate": 1.686666666666667e-05, "loss": 0.0297, "num_tokens": 12508840.0, "reward": 1.1047951132059097, "reward_std": 0.2661496289074421, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.11143571883440018, "rewards/penalized_accuracy_reward/std": 0.23958711326122284, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 254 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.25, "completions/max_terminated_length": 811.25, "completions/mean_length": 496.40625, "completions/mean_terminated_length": 496.40625, "completions/min_length": 261.75, "completions/min_terminated_length": 261.75, "epoch": 0.085, 
"grad_norm": 0.6173115372657776, "kl": 0.173095703125, "learning_rate": 1.6933333333333336e-05, "loss": -0.0366, "num_tokens": 12548482.0, "reward": 1.1505606770515442, "reward_std": 0.35186809953302145, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1531648486852646, "rewards/penalized_accuracy_reward/std": 0.3414514288306236, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 255 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.75, "completions/max_terminated_length": 615.75, "completions/mean_length": 392.28125, "completions/mean_terminated_length": 392.28125, "completions/min_length": 216.25, "completions/min_terminated_length": 216.25, "epoch": 0.08533333333333333, "grad_norm": 0.7294853329658508, "kl": 0.1728515625, "learning_rate": 1.7e-05, "loss": -0.0149, "num_tokens": 12582468.0, "reward": 1.2870292961597443, "reward_std": 0.31009191926568747, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2896333932876587, "rewards/penalized_accuracy_reward/std": 0.29967525601387024, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 256 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.5, "completions/max_terminated_length": 762.5, "completions/mean_length": 523.546875, "completions/mean_terminated_length": 523.546875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.08566666666666667, "grad_norm": 0.32328730821609497, "kl": 0.1884765625, "learning_rate": 1.706666666666667e-05, "loss": 0.0309, "num_tokens": 12629335.0, "reward": 
1.0314277112483978, "reward_std": 0.1257108747959137, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.031427718698978424, "rewards/penalized_accuracy_reward/std": 0.1257108747959137, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 257 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 774.25, "completions/max_terminated_length": 746.5, "completions/mean_length": 553.3125, "completions/mean_terminated_length": 548.5385437011719, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.086, "grad_norm": 0.8301519155502319, "kl": 0.17578125, "learning_rate": 1.7133333333333334e-05, "loss": 0.0291, "num_tokens": 12675451.0, "reward": 1.024748146533966, "reward_std": 0.15378290473017842, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.03438352793455124, "rewards/penalized_accuracy_reward/std": 0.13753412663936615, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 258 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 757.5, "completions/max_terminated_length": 722.75, "completions/mean_length": 498.90625, "completions/mean_terminated_length": 493.2687530517578, "completions/min_length": 303.75, "completions/min_terminated_length": 303.75, "epoch": 0.08633333333333333, "grad_norm": 0.6008172631263733, "kl": 0.219970703125, "learning_rate": 1.72e-05, "loss": 0.01, "num_tokens": 12717413.0, "reward": 1.2428324222564697, "reward_std": 0.4229099154472351, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, 
"rewards/penalized_accuracy_reward/mean": 0.2520771995186806, "rewards/penalized_accuracy_reward/std": 0.41564619541168213, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 259 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.25, "completions/max_terminated_length": 803.25, "completions/mean_length": 598.71875, "completions/mean_terminated_length": 598.71875, "completions/min_length": 417.75, "completions/min_terminated_length": 417.75, "epoch": 0.08666666666666667, "grad_norm": 0.42760568857192993, "kl": 0.21142578125, "learning_rate": 1.726666666666667e-05, "loss": 0.0179, "num_tokens": 12765491.0, "reward": 1.0692082047462463, "reward_std": 0.2115136981010437, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.07584881037473679, "rewards/penalized_accuracy_reward/std": 0.20726299285888672, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 260 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 563.328125, "completions/mean_terminated_length": 563.328125, "completions/min_length": 325.5, "completions/min_terminated_length": 325.5, "epoch": 0.087, "grad_norm": 0.5432370901107788, "kl": 0.1748046875, "learning_rate": 1.7333333333333336e-05, "loss": 0.0149, "num_tokens": 12810776.0, "reward": 0.9929687529802322, "reward_std": 0.02812499925494194, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 261 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 882.75, "completions/max_terminated_length": 880.25, "completions/mean_length": 624.921875, "completions/mean_terminated_length": 619.2250061035156, "completions/min_length": 424.75, "completions/min_terminated_length": 424.75, "epoch": 0.08733333333333333, "grad_norm": 0.61566561460495, "kl": 0.185791015625, "learning_rate": 1.7400000000000003e-05, "loss": 0.0556, "num_tokens": 12859571.0, "reward": 1.3334292024374008, "reward_std": 0.586882371455431, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.34006984531879425, "rewards/penalized_accuracy_reward/std": 0.5603198409080505, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 262 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.5, "completions/max_terminated_length": 780.5, "completions/mean_length": 567.390625, "completions/mean_terminated_length": 567.390625, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.08766666666666667, "grad_norm": 0.4211113154888153, "kl": 0.180419921875, "learning_rate": 1.7466666666666667e-05, "loss": 0.0048, "num_tokens": 12904428.0, "reward": 0.9907552152872086, "reward_std": 0.036979163996875286, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 263 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 724.25, "completions/max_terminated_length": 714.5, "completions/mean_length": 522.640625, "completions/mean_terminated_length": 516.8020935058594, "completions/min_length": 365.25, "completions/min_terminated_length": 365.25, "epoch": 0.088, "grad_norm": 0.3019135296344757, "kl": 0.176513671875, "learning_rate": 1.7533333333333337e-05, "loss": 0.0236, "num_tokens": 12948645.0, "reward": 1.027706801891327, "reward_std": 0.1416618824005127, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.03434742987155914, "rewards/penalized_accuracy_reward/std": 0.13738973438739777, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 264 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 913.75, "completions/max_terminated_length": 875.0, "completions/mean_length": 612.90625, "completions/mean_terminated_length": 607.7541809082031, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.08833333333333333, "grad_norm": 0.47055670619010925, "kl": 0.1806640625, "learning_rate": 1.76e-05, "loss": 0.0249, "num_tokens": 12999887.0, "reward": 1.103381261229515, "reward_std": 0.2237912304699421, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.11002188175916672, "rewards/penalized_accuracy_reward/std": 0.19722871482372284, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 265 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.5, "completions/max_terminated_length": 803.5, "completions/mean_length": 
576.75, "completions/mean_terminated_length": 576.75, "completions/min_length": 412.5, "completions/min_terminated_length": 412.5, "epoch": 0.08866666666666667, "grad_norm": 0.40757638216018677, "kl": 0.2119140625, "learning_rate": 1.7666666666666668e-05, "loss": -0.0054, "num_tokens": 13047551.0, "reward": 0.9933593720197678, "reward_std": 0.02656250074505806, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 266 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 890.25, "completions/max_terminated_length": 780.5, "completions/mean_length": 595.78125, "completions/mean_terminated_length": 581.792724609375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.089, "grad_norm": 0.6589139103889465, "kl": 0.158447265625, "learning_rate": 1.7733333333333335e-05, "loss": 0.0656, "num_tokens": 13095889.0, "reward": 1.192888155579567, "reward_std": 0.20832654368132353, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.2225756049156189, "rewards/penalized_accuracy_reward/std": 0.1634555160999298, "rewards/reasoning_steps_reward/mean": 0.9687500149011612, "rewards/reasoning_steps_reward/std": 0.0936010368168354, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 267 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 513.046875, "completions/mean_terminated_length": 513.046875, "completions/min_length": 287.75, "completions/min_terminated_length": 287.75, "epoch": 0.08933333333333333, 
"grad_norm": 0.5512727499008179, "kl": 0.167236328125, "learning_rate": 1.7800000000000002e-05, "loss": -0.0193, "num_tokens": 13145492.0, "reward": 1.1362604349851608, "reward_std": 0.3192702382802963, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14928126335144043, "rewards/penalized_accuracy_reward/std": 0.2671869397163391, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.1041666641831398, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 268 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.25, "completions/max_terminated_length": 777.25, "completions/mean_length": 584.609375, "completions/mean_terminated_length": 584.609375, "completions/min_length": 351.75, "completions/min_terminated_length": 351.75, "epoch": 0.08966666666666667, "grad_norm": 0.5141566395759583, "kl": 0.194580078125, "learning_rate": 1.7866666666666666e-05, "loss": -0.0297, "num_tokens": 13196363.0, "reward": 0.9873698055744171, "reward_std": 0.03819324728101492, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.033994100987911224, "step": 269 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 831.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 570.375, "completions/mean_terminated_length": 561.8895874023438, "completions/min_length": 363.25, "completions/min_terminated_length": 363.25, "epoch": 0.09, "grad_norm": 0.4032162129878998, "kl": 0.18017578125, "learning_rate": 1.7933333333333333e-05, "loss": -0.0049, "num_tokens": 
13244691.0, "reward": 1.3360155820846558, "reward_std": 0.20632131397724152, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.3464322090148926, "rewards/penalized_accuracy_reward/std": 0.20845253765583038, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 270 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 883.0, "completions/max_terminated_length": 798.75, "completions/mean_length": 498.546875, "completions/mean_terminated_length": 490.7093811035156, "completions/min_length": 308.5, "completions/min_terminated_length": 308.5, "epoch": 0.09033333333333333, "grad_norm": 0.7402020692825317, "kl": 0.1796875, "learning_rate": 1.8e-05, "loss": 0.0827, "num_tokens": 13286470.0, "reward": 1.0911231935024261, "reward_std": 0.22846381599083543, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.10049816966056824, "rewards/penalized_accuracy_reward/std": 0.21616530418395996, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.125, "step": 271 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.75, "completions/max_terminated_length": 751.75, "completions/mean_length": 562.796875, "completions/mean_terminated_length": 562.796875, "completions/min_length": 408.75, "completions/min_terminated_length": 408.75, "epoch": 0.09066666666666667, "grad_norm": 0.3422425091266632, "kl": 0.206298828125, "learning_rate": 1.8066666666666668e-05, "loss": 0.0166, "num_tokens": 13336505.0, "reward": 1.0235688090324402, "reward_std": 0.1415824592113495, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0339854471385479, "rewards/penalized_accuracy_reward/std": 0.13594180345535278, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.05692750960588455, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 272 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.5, "completions/max_terminated_length": 767.5, "completions/mean_length": 544.65625, "completions/mean_terminated_length": 544.65625, "completions/min_length": 337.75, "completions/min_terminated_length": 337.75, "epoch": 0.091, "grad_norm": 0.23651820421218872, "kl": 0.1640625, "learning_rate": 1.8133333333333335e-05, "loss": -0.0169, "num_tokens": 13380803.0, "reward": 0.9903645813465118, "reward_std": 0.02933359704911709, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 273 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.75, "completions/max_terminated_length": 786.75, "completions/mean_length": 498.234375, "completions/mean_terminated_length": 498.234375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.09133333333333334, "grad_norm": 0.6377527117729187, "kl": 0.165283203125, "learning_rate": 1.8200000000000002e-05, "loss": -0.0266, "num_tokens": 13425106.0, "reward": 1.054260477423668, "reward_std": 0.27288868278265, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.07652609050273895, "rewards/penalized_accuracy_reward/std": 0.2091090977191925, 
"rewards/reasoning_steps_reward/mean": 0.984375, "rewards/reasoning_steps_reward/std": 0.045325469225645065, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.078125, "step": 274 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 485.234375, "completions/mean_terminated_length": 485.234375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.09166666666666666, "grad_norm": 0.6798899173736572, "kl": 0.169189453125, "learning_rate": 1.826666666666667e-05, "loss": -0.0909, "num_tokens": 13464209.0, "reward": 0.9557291865348816, "reward_std": 0.11883590277284384, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.911458358168602, "rewards/reasoning_steps_reward/std": 0.23767182603478432, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 275 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.75, "completions/max_terminated_length": 796.75, "completions/mean_length": 568.765625, "completions/mean_terminated_length": 568.765625, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.092, "grad_norm": 0.5649985074996948, "kl": 0.16259765625, "learning_rate": 1.8333333333333333e-05, "loss": -0.0531, "num_tokens": 13511362.0, "reward": 0.9759114682674408, "reward_std": 0.09635416325181723, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.0833333320915699, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 
0.046875, "step": 276 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 896.25, "completions/max_terminated_length": 881.0, "completions/mean_length": 666.0625, "completions/mean_terminated_length": 661.7218780517578, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.09233333333333334, "grad_norm": 0.5934914946556091, "kl": 0.18359375, "learning_rate": 1.8400000000000003e-05, "loss": 0.0345, "num_tokens": 13564806.0, "reward": 0.9390624910593033, "reward_std": 0.11813730373978615, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.27289126068353653, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.09526265040040016, "step": 277 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.75, "completions/max_terminated_length": 801.75, "completions/mean_length": 603.125, "completions/mean_terminated_length": 603.125, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.09266666666666666, "grad_norm": 0.8149129152297974, "kl": 0.19921875, "learning_rate": 1.8466666666666667e-05, "loss": 0.0618, "num_tokens": 13612222.0, "reward": 0.9855468720197678, "reward_std": 0.057812500395812094, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.078125, "step": 278 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 900.5, "completions/max_terminated_length": 885.5, "completions/mean_length": 724.78125, 
"completions/mean_terminated_length": 717.1986694335938, "completions/min_length": 517.25, "completions/min_terminated_length": 517.25, "epoch": 0.093, "grad_norm": 0.3915879726409912, "kl": 0.1728515625, "learning_rate": 1.8533333333333334e-05, "loss": 0.018, "num_tokens": 13667648.0, "reward": 0.9841145873069763, "reward_std": 0.0467079458758235, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.08539126068353653, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.021347815170884132, "step": 279 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 907.75, "completions/max_terminated_length": 887.75, "completions/mean_length": 635.84375, "completions/mean_terminated_length": 630.9354248046875, "completions/min_length": 325.5, "completions/min_terminated_length": 325.5, "epoch": 0.09333333333333334, "grad_norm": 0.4796634614467621, "kl": 0.1650390625, "learning_rate": 1.86e-05, "loss": 0.0293, "num_tokens": 13720150.0, "reward": 0.9566406309604645, "reward_std": 0.09092025086283684, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.1971946656703949, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1265372931957245, "step": 280 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 839.75, "completions/max_terminated_length": 807.0, "completions/mean_length": 557.234375, "completions/mean_terminated_length": 547.1428680419922, "completions/min_length": 304.5, "completions/min_terminated_length": 304.5, "epoch": 
0.09366666666666666, "grad_norm": 0.6499218940734863, "kl": 0.19873046875, "learning_rate": 1.866666666666667e-05, "loss": -0.005, "num_tokens": 13764053.0, "reward": 1.1594230234622955, "reward_std": 0.308895755559206, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.10077822208404541, "rewards/penalized_accuracy_reward/mean": 0.2009594738483429, "rewards/penalized_accuracy_reward/std": 0.20988135039806366, "rewards/reasoning_steps_reward/mean": 0.9583333432674408, "rewards/reasoning_steps_reward/std": 0.10885214060544968, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.049575019627809525, "step": 281 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 823.75, "completions/max_terminated_length": 815.0, "completions/mean_length": 603.921875, "completions/mean_terminated_length": 598.4656372070312, "completions/min_length": 353.5, "completions/min_terminated_length": 353.5, "epoch": 0.094, "grad_norm": 0.4768775403499603, "kl": 0.1806640625, "learning_rate": 1.8733333333333336e-05, "loss": 0.0299, "num_tokens": 13813200.0, "reward": 1.022420957684517, "reward_std": 0.18214058130979538, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.037915751338005066, "rewards/penalized_accuracy_reward/std": 0.15166300535202026, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 282 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.25, "completions/max_terminated_length": 810.25, "completions/mean_length": 616.890625, "completions/mean_terminated_length": 616.890625, "completions/min_length": 406.25, "completions/min_terminated_length": 406.25, "epoch": 0.09433333333333334, "grad_norm": 0.4238940179347992, "kl": 0.180419921875, 
"learning_rate": 1.88e-05, "loss": -0.0062, "num_tokens": 13862537.0, "reward": 0.990364596247673, "reward_std": 0.03854166250675917, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 283 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.75, "completions/max_terminated_length": 802.75, "completions/mean_length": 610.5, "completions/mean_terminated_length": 610.5, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.09466666666666666, "grad_norm": 0.3470335900783539, "kl": 0.1865234375, "learning_rate": 1.886666666666667e-05, "loss": 0.0145, "num_tokens": 13910329.0, "reward": 0.9973958432674408, "reward_std": 0.010416663251817226, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 284 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.5, "completions/max_terminated_length": 790.5, "completions/mean_length": 569.078125, "completions/mean_terminated_length": 569.078125, "completions/min_length": 363.25, "completions/min_terminated_length": 363.25, "epoch": 0.095, "grad_norm": 0.4298518896102905, "kl": 0.1962890625, "learning_rate": 1.8933333333333334e-05, "loss": 0.0273, "num_tokens": 13955998.0, "reward": 0.9911458343267441, "reward_std": 0.026434535160660744, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 
0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 285 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.5, "completions/max_terminated_length": 746.5, "completions/mean_length": 500.328125, "completions/mean_terminated_length": 500.328125, "completions/min_length": 272.25, "completions/min_terminated_length": 272.25, "epoch": 0.09533333333333334, "grad_norm": 0.36656492948532104, "kl": 0.195556640625, "learning_rate": 1.9e-05, "loss": 0.0052, "num_tokens": 13999267.0, "reward": 1.035222053527832, "reward_std": 0.1523541957139969, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03782619908452034, "rewards/penalized_accuracy_reward/std": 0.15130481123924255, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 286 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 480.203125, "completions/mean_terminated_length": 480.203125, "completions/min_length": 238.75, "completions/min_terminated_length": 238.75, "epoch": 0.09566666666666666, "grad_norm": 0.7211626172065735, "kl": 0.20703125, "learning_rate": 1.9066666666666668e-05, "loss": -0.0091, "num_tokens": 14044032.0, "reward": 0.9789062440395355, "reward_std": 0.08437500149011612, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.984375, 
"rewards/reasoning_steps_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 287 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 398.984375, "completions/mean_terminated_length": 398.984375, "completions/min_length": 206.5, "completions/min_terminated_length": 206.5, "epoch": 0.096, "grad_norm": 0.7898841500282288, "kl": 0.2197265625, "learning_rate": 1.9133333333333335e-05, "loss": -0.068, "num_tokens": 14079407.0, "reward": 0.9791666865348816, "reward_std": 0.06615879014134407, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9583333432674408, "rewards/reasoning_steps_reward/std": 0.13231760263442993, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 288 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.25, "completions/max_terminated_length": 661.25, "completions/mean_length": 460.09375, "completions/mean_terminated_length": 460.09375, "completions/min_length": 265.5, "completions/min_terminated_length": 265.5, "epoch": 0.09633333333333334, "grad_norm": 0.4932043254375458, "kl": 0.185791015625, "learning_rate": 1.9200000000000003e-05, "loss": -0.0723, "num_tokens": 14120565.0, "reward": 1.018554836511612, "reward_std": 0.13671931624412537, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02636732906103134, "rewards/penalized_accuracy_reward/std": 0.10546931624412537, "rewards/reasoning_steps_reward/mean": 0.984375, "rewards/reasoning_steps_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 289 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, 
"completions/max_length": 639.75, "completions/max_terminated_length": 639.75, "completions/mean_length": 482.90625, "completions/mean_terminated_length": 482.90625, "completions/min_length": 266.25, "completions/min_terminated_length": 266.25, "epoch": 0.09666666666666666, "grad_norm": 0.1181245893239975, "kl": 0.24072265625, "learning_rate": 1.926666666666667e-05, "loss": 0.0096, "num_tokens": 14161055.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 290 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.25, "completions/max_terminated_length": 711.25, "completions/mean_length": 523.328125, "completions/mean_terminated_length": 523.328125, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.097, "grad_norm": 0.3490356504917145, "kl": 0.21484375, "learning_rate": 1.9333333333333333e-05, "loss": 0.0178, "num_tokens": 14204964.0, "reward": 1.052797555923462, "reward_std": 0.144706130027771, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.05279757082462311, "rewards/penalized_accuracy_reward/std": 0.1447061449289322, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 291 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.75, "completions/max_terminated_length": 787.75, "completions/mean_length": 582.15625, "completions/mean_terminated_length": 582.15625, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.09733333333333333, 
"grad_norm": 0.4878210723400116, "kl": 0.20556640625, "learning_rate": 1.94e-05, "loss": 0.0187, "num_tokens": 14253038.0, "reward": 1.0754664540290833, "reward_std": 0.28016388416290283, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.08210709318518639, "rewards/penalized_accuracy_reward/std": 0.27484729140996933, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 292 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.75, "completions/max_terminated_length": 706.75, "completions/mean_length": 535.59375, "completions/mean_terminated_length": 535.59375, "completions/min_length": 408.75, "completions/min_terminated_length": 408.75, "epoch": 0.09766666666666667, "grad_norm": 0.5533336400985718, "kl": 0.20703125, "learning_rate": 1.9466666666666668e-05, "loss": 0.0085, "num_tokens": 14296564.0, "reward": 1.1724434942007065, "reward_std": 0.27161475270986557, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.18168824911117554, "rewards/penalized_accuracy_reward/std": 0.24373677372932434, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 293 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.75, "completions/max_terminated_length": 692.75, "completions/mean_length": 539.421875, "completions/mean_terminated_length": 539.421875, "completions/min_length": 375.5, "completions/min_terminated_length": 375.5, "epoch": 0.098, "grad_norm": 0.14699687063694, "kl": 0.2314453125, "learning_rate": 1.9533333333333335e-05, "loss": 0.0093, "num_tokens": 14340399.0, "reward": 1.0, 
"reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 294 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.5, "completions/max_terminated_length": 831.5, "completions/mean_length": 598.734375, "completions/mean_terminated_length": 598.734375, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.09833333333333333, "grad_norm": 0.5051561594009399, "kl": 0.232177734375, "learning_rate": 1.9600000000000002e-05, "loss": 0.0006, "num_tokens": 14387806.0, "reward": 0.9841145873069763, "reward_std": 0.0467079458758235, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.08539126068353653, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.021347815170884132, "step": 295 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.75, "completions/max_terminated_length": 937.75, "completions/mean_length": 716.828125, "completions/mean_terminated_length": 716.828125, "completions/min_length": 512.75, "completions/min_terminated_length": 512.75, "epoch": 0.09866666666666667, "grad_norm": 0.33059147000312805, "kl": 0.159423828125, "learning_rate": 1.9666666666666666e-05, "loss": 0.002, "num_tokens": 14444563.0, "reward": 0.9973958432674408, "reward_std": 0.010416663251817226, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 296 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 911.75, "completions/max_terminated_length": 848.5, "completions/mean_length": 708.296875, "completions/mean_terminated_length": 702.8958435058594, "completions/min_length": 485.5, "completions/min_terminated_length": 485.5, "epoch": 0.099, "grad_norm": 0.6231974363327026, "kl": 0.19970703125, "learning_rate": 1.9733333333333336e-05, "loss": 0.0085, "num_tokens": 14499014.0, "reward": 0.927083358168602, "reward_std": 0.15352921932935715, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.2979728877544403, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9791666865348816, "rewards/reasoning_steps_reward/std": 0.0833333283662796, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.13525452837347984, "step": 297 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 899.0, "completions/max_terminated_length": 858.5, "completions/mean_length": 623.4375, "completions/mean_terminated_length": 616.4937591552734, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.09933333333333333, "grad_norm": 1.3560590744018555, "kl": 0.213134765625, "learning_rate": 1.98e-05, "loss": -0.0724, "num_tokens": 14548466.0, "reward": 0.6550781428813934, "reward_std": 0.18048088252544403, "rewards/format_reward/mean": 0.234375, "rewards/format_reward/std": 0.4006601870059967, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.61328125, 
"rewards/tag_count_reward/std": 0.2045094631612301, "step": 298 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.5, "completions/max_terminated_length": 873.5, "completions/mean_length": 669.78125, "completions/mean_terminated_length": 669.78125, "completions/min_length": 468.75, "completions/min_terminated_length": 468.75, "epoch": 0.09966666666666667, "grad_norm": 0.15527978539466858, "kl": 0.196044921875, "learning_rate": 1.9866666666666667e-05, "loss": 0.0078, "num_tokens": 14603700.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 299 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 869.875, "completions/mean_terminated_length": 833.1428833007812, "completions/min_length": 634.25, "completions/min_terminated_length": 634.25, "epoch": 0.1, "grad_norm": 0.5940057635307312, "kl": 0.183349609375, "learning_rate": 1.9933333333333334e-05, "loss": 0.0858, "num_tokens": 14673020.0, "reward": 0.919921875, "reward_std": 0.16870027035474777, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.3943893313407898, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.11124361865222454, "step": 300 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 718.484375, 
"completions/mean_terminated_length": 718.484375, "completions/min_length": 521.25, "completions/min_terminated_length": 521.25, "epoch": 0.10033333333333333, "grad_norm": 0.4100304841995239, "kl": 0.172119140625, "learning_rate": 2e-05, "loss": 0.0099, "num_tokens": 14731275.0, "reward": 1.0655567944049835, "reward_std": 0.2622271999716759, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06555680371820927, "rewards/penalized_accuracy_reward/std": 0.26222722977399826, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 301 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 975.25, "completions/max_terminated_length": 947.75, "completions/mean_length": 798.65625, "completions/mean_terminated_length": 758.5296630859375, "completions/min_length": 491.5, "completions/min_terminated_length": 491.5, "epoch": 0.10066666666666667, "grad_norm": 0.6387519836425781, "kl": 0.197509765625, "learning_rate": 1.999999323072477e-05, "loss": 0.0339, "num_tokens": 14793445.0, "reward": 0.7894531488418579, "reward_std": 0.2002791464328766, "rewards/format_reward/mean": 0.59375, "rewards/format_reward/std": 0.46449070423841476, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.51953125, "rewards/tag_count_reward/std": 0.2541043721139431, "step": 302 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 942.25, "completions/max_terminated_length": 878.5, "completions/mean_length": 685.21875, "completions/mean_terminated_length": 645.7833557128906, "completions/min_length": 409.25, "completions/min_terminated_length": 409.25, "epoch": 0.101, "grad_norm": 0.7488775849342346, 
"kl": 0.199951171875, "learning_rate": 1.999997292290824e-05, "loss": 0.166, "num_tokens": 14846803.0, "reward": 0.9568192660808563, "reward_std": 0.272556833922863, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.29237766563892365, "rewards/penalized_accuracy_reward/mean": 0.05642864480614662, "rewards/penalized_accuracy_reward/std": 0.15471996366977692, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.75390625, "rewards/tag_count_reward/std": 0.3052690625190735, "step": 303 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.25, "completions/max_terminated_length": 793.25, "completions/mean_length": 533.421875, "completions/mean_terminated_length": 533.421875, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.10133333333333333, "grad_norm": 0.760302722454071, "kl": 0.222900390625, "learning_rate": 1.9999939076577906e-05, "loss": 0.0037, "num_tokens": 14893182.0, "reward": 1.0884817838668823, "reward_std": 0.4543069154024124, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.10671091079711914, "rewards/penalized_accuracy_reward/std": 0.42684365808963776, "rewards/reasoning_steps_reward/mean": 0.9635416716337204, "rewards/reasoning_steps_reward/std": 0.11148427054286003, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 304 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.5, "completions/max_terminated_length": 756.5, "completions/mean_length": 585.265625, "completions/mean_terminated_length": 585.265625, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.10166666666666667, "grad_norm": 0.5774600505828857, "kl": 0.211669921875, "learning_rate": 1.999989169177959e-05, "loss": -0.0124, "num_tokens": 14937551.0, "reward": 
0.9802083224058151, "reward_std": 0.06704528210684657, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9635416716337204, "rewards/reasoning_steps_reward/std": 0.12704972177743912, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.042695630341768265, "step": 305 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 473.1875, "completions/mean_terminated_length": 473.1875, "completions/min_length": 286.75, "completions/min_terminated_length": 286.75, "epoch": 0.102, "grad_norm": 0.7446598410606384, "kl": 0.22705078125, "learning_rate": 1.9999830768577445e-05, "loss": 0.0203, "num_tokens": 14978139.0, "reward": 1.1352150440216064, "reward_std": 0.26945349760353565, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.19732439517974854, "rewards/penalized_accuracy_reward/std": 0.18291671574115753, "rewards/reasoning_steps_reward/mean": 0.90625, "rewards/reasoning_steps_reward/std": 0.18649740889668465, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.07668973132967949, "step": 306 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.25, "completions/max_terminated_length": 740.25, "completions/mean_length": 507.578125, "completions/mean_terminated_length": 507.578125, "completions/min_length": 329.25, "completions/min_terminated_length": 329.25, "epoch": 0.10233333333333333, "grad_norm": 0.6429900527000427, "kl": 0.20166015625, "learning_rate": 1.9999756307053947e-05, "loss": -0.0358, "num_tokens": 15020304.0, "reward": 1.074374109506607, "reward_std": 0.2731225108727813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.10419180989265442, "rewards/penalized_accuracy_reward/std": 0.22402693331241608, "rewards/reasoning_steps_reward/mean": 0.9427083432674408, "rewards/reasoning_steps_reward/std": 0.13398722559213638, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.03697281517088413, "step": 307 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 731.25, "completions/max_terminated_length": 692.75, "completions/mean_length": 516.828125, "completions/mean_terminated_length": 509.7354278564453, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.10266666666666667, "grad_norm": 0.7568418979644775, "kl": 0.234619140625, "learning_rate": 1.999966830730992e-05, "loss": 0.0749, "num_tokens": 15062277.0, "reward": 0.9467447847127914, "reward_std": 0.11273385118693113, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9114583432674408, "rewards/reasoning_steps_reward/std": 0.18772627413272858, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.08538510836660862, "step": 308 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.25, "completions/max_terminated_length": 881.25, "completions/mean_length": 571.71875, "completions/mean_terminated_length": 571.71875, "completions/min_length": 352.25, "completions/min_terminated_length": 352.25, "epoch": 0.103, "grad_norm": 1.1642966270446777, "kl": 0.25732421875, "learning_rate": 1.9999566769464483e-05, "loss": 0.0204, "num_tokens": 15111011.0, "reward": 0.9882493168115616, "reward_std": 0.2335687279701233, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.03486389294266701, "rewards/penalized_accuracy_reward/std": 
0.13945557177066803, "rewards/reasoning_steps_reward/mean": 0.942708358168602, "rewards/reasoning_steps_reward/std": 0.14845871925354004, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.11003002151846886, "step": 309 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.75, "completions/max_terminated_length": 705.75, "completions/mean_length": 541.015625, "completions/mean_terminated_length": 541.015625, "completions/min_length": 359.75, "completions/min_terminated_length": 359.75, "epoch": 0.10333333333333333, "grad_norm": 0.6742736101150513, "kl": 0.2041015625, "learning_rate": 1.9999451693655125e-05, "loss": -0.0092, "num_tokens": 15156244.0, "reward": 0.9901554882526398, "reward_std": 0.2408963106572628, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.21039126068353653, "rewards/penalized_accuracy_reward/mean": 0.035337790846824646, "rewards/penalized_accuracy_reward/std": 0.14135116338729858, "rewards/reasoning_steps_reward/mean": 0.9635416865348816, "rewards/reasoning_steps_reward/std": 0.08714327588677406, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.05644455552101135, "step": 310 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.25, "completions/max_terminated_length": 792.25, "completions/mean_length": 565.4375, "completions/mean_terminated_length": 565.4375, "completions/min_length": 398.75, "completions/min_terminated_length": 398.75, "epoch": 0.10366666666666667, "grad_norm": 0.7241008281707764, "kl": 0.27734375, "learning_rate": 1.9999323080037623e-05, "loss": 0.0163, "num_tokens": 15203392.0, "reward": 1.0042553097009659, "reward_std": 0.1819033268839121, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.1875, "rewards/penalized_accuracy_reward/mean": 0.02938549779355526, "rewards/penalized_accuracy_reward/std": 0.11754199117422104, "rewards/reasoning_steps_reward/mean": 
0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 311 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.5, "completions/max_terminated_length": 802.5, "completions/mean_length": 581.71875, "completions/mean_terminated_length": 581.71875, "completions/min_length": 346.25, "completions/min_terminated_length": 346.25, "epoch": 0.104, "grad_norm": 0.47812944650650024, "kl": 0.269775390625, "learning_rate": 1.9999180928786113e-05, "loss": 0.006, "num_tokens": 15249166.0, "reward": 1.1379987299442291, "reward_std": 0.26195312198251486, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14060290157794952, "rewards/penalized_accuracy_reward/std": 0.25153645873069763, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 312 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.25, "completions/max_terminated_length": 862.25, "completions/mean_length": 581.5, "completions/mean_terminated_length": 581.5, "completions/min_length": 395.5, "completions/min_terminated_length": 395.5, "epoch": 0.10433333333333333, "grad_norm": 0.5108832120895386, "kl": 0.306640625, "learning_rate": 1.9999025240093045e-05, "loss": 0.033, "num_tokens": 15295694.0, "reward": 1.0145233571529388, "reward_std": 0.11746851913630962, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.028585880994796753, "rewards/penalized_accuracy_reward/std": 0.11434352397918701, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 313 
}, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.25, "completions/max_terminated_length": 888.25, "completions/mean_length": 619.921875, "completions/mean_terminated_length": 619.921875, "completions/min_length": 384.25, "completions/min_terminated_length": 384.25, "epoch": 0.10466666666666667, "grad_norm": 0.4307948350906372, "kl": 0.276611328125, "learning_rate": 1.9998856014169193e-05, "loss": 0.0133, "num_tokens": 15344585.0, "reward": 1.0649471282958984, "reward_std": 0.26679350435733795, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.125, "rewards/penalized_accuracy_reward/mean": 0.07900964096188545, "rewards/penalized_accuracy_reward/std": 0.256163090467453, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 314 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.5, "completions/max_terminated_length": 895.5, "completions/mean_length": 611.375, "completions/mean_terminated_length": 611.375, "completions/min_length": 342.75, "completions/min_terminated_length": 342.75, "epoch": 0.105, "grad_norm": 0.5434288382530212, "kl": 0.29736328125, "learning_rate": 1.9998673251243672e-05, "loss": 0.0095, "num_tokens": 15394177.0, "reward": 1.2838045805692673, "reward_std": 0.5009002275764942, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.14789126068353653, "rewards/penalized_accuracy_reward/mean": 0.3078931048512459, "rewards/penalized_accuracy_reward/std": 0.4622473865747452, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.07966844737529755, "step": 315 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 950.75, 
"completions/max_terminated_length": 863.75, "completions/mean_length": 669.609375, "completions/mean_terminated_length": 648.5461578369141, "completions/min_length": 464.25, "completions/min_terminated_length": 464.25, "epoch": 0.10533333333333333, "grad_norm": 0.8112999796867371, "kl": 0.3037109375, "learning_rate": 1.9998476951563914e-05, "loss": 0.059, "num_tokens": 15450456.0, "reward": 0.8890625238418579, "reward_std": 0.19580427184700966, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.44091323018074036, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.21520674601197243, "step": 316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.25, "completions/max_terminated_length": 716.25, "completions/mean_length": 511.171875, "completions/mean_terminated_length": 511.171875, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.10566666666666667, "grad_norm": 0.8981077671051025, "kl": 0.2529296875, "learning_rate": 1.999826711539568e-05, "loss": 0.0537, "num_tokens": 15492371.0, "reward": 0.93831005692482, "reward_std": 0.3836033381521702, "rewards/format_reward/mean": 0.640625, "rewards/format_reward/std": 0.4436737895011902, "rewards/penalized_accuracy_reward/mean": 0.09729444235563278, "rewards/penalized_accuracy_reward/std": 0.20956267416477203, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.24108554422855377, "step": 317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 931.5, "completions/max_terminated_length": 889.0, "completions/mean_length": 634.109375, "completions/mean_terminated_length": 
627.4500122070312, "completions/min_length": 356.25, "completions/min_terminated_length": 356.25, "epoch": 0.106, "grad_norm": 0.7437970638275146, "kl": 0.23193359375, "learning_rate": 1.9998043743023056e-05, "loss": 0.0702, "num_tokens": 15543850.0, "reward": 0.8113281279802322, "reward_std": 0.21419205144047737, "rewards/format_reward/mean": 0.5625, "rewards/format_reward/std": 0.4970766380429268, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.17918536625802517, "step": 318 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.5, "completions/max_terminated_length": 684.5, "completions/mean_length": 528.03125, "completions/mean_terminated_length": 528.03125, "completions/min_length": 383.5, "completions/min_terminated_length": 383.5, "epoch": 0.10633333333333334, "grad_norm": 0.8449734449386597, "kl": 0.2451171875, "learning_rate": 1.9997806834748455e-05, "loss": 0.0074, "num_tokens": 15589388.0, "reward": 0.9253906309604645, "reward_std": 0.15440326184034348, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.35648179799318314, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1257193200290203, "step": 319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.5, "completions/max_terminated_length": 671.5, "completions/mean_length": 475.796875, "completions/mean_terminated_length": 475.796875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.10666666666666667, "grad_norm": 0.7161391377449036, "kl": 0.25830078125, 
"learning_rate": 1.9997556390892623e-05, "loss": -0.0051, "num_tokens": 15634095.0, "reward": 1.007447510957718, "reward_std": 0.16748694330453873, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.21039126068353653, "rewards/penalized_accuracy_reward/mean": 0.034010034054517746, "rewards/penalized_accuracy_reward/std": 0.13604013621807098, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.05259781517088413, "step": 320 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.75, "completions/max_terminated_length": 623.75, "completions/mean_length": 358.078125, "completions/mean_terminated_length": 358.078125, "completions/min_length": 211.25, "completions/min_terminated_length": 211.25, "epoch": 0.107, "grad_norm": 9.0658597946167, "kl": 0.81982421875, "learning_rate": 1.999729241179462e-05, "loss": 0.1053, "num_tokens": 15666116.0, "reward": 0.9933593720197678, "reward_std": 0.02656250074505806, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 321 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.75, "completions/max_terminated_length": 721.75, "completions/mean_length": 501.75, "completions/mean_terminated_length": 501.75, "completions/min_length": 289.75, "completions/min_terminated_length": 289.75, "epoch": 0.10733333333333334, "grad_norm": 0.9017462730407715, "kl": 0.28466796875, "learning_rate": 1.9997014897811834e-05, "loss": 0.0382, "num_tokens": 15707316.0, "reward": 0.6950409710407257, "reward_std": 0.3689886610955, "rewards/format_reward/mean": 0.015625, 
"rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.12446802854537964, "rewards/penalized_accuracy_reward/std": 0.34050996601581573, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.6953125, "rewards/tag_count_reward/std": 0.14960849285125732, "step": 322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 554.4375, "completions/mean_terminated_length": 486.5102767944336, "completions/min_length": 317.5, "completions/min_terminated_length": 317.5, "epoch": 0.10766666666666666, "grad_norm": 0.8251973390579224, "kl": 0.3310546875, "learning_rate": 1.9996723849319978e-05, "loss": 0.0938, "num_tokens": 15752624.0, "reward": 0.536979153752327, "reward_std": 0.04579480132088065, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.05810113251209259, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.25458791851997375, "step": 323 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 950.75, "completions/max_terminated_length": 822.25, "completions/mean_length": 559.09375, "completions/mean_terminated_length": 491.10145568847656, "completions/min_length": 292.25, "completions/min_terminated_length": 292.25, "epoch": 0.108, "grad_norm": 0.9178186655044556, "kl": 0.298828125, "learning_rate": 1.9996419266713097e-05, "loss": 0.0633, "num_tokens": 15796566.0, "reward": 0.5134114697575569, "reward_std": 0.09221521113067865, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 
0.0, "rewards/reasoning_steps_reward/mean": 0.9166666865348816, "rewards/reasoning_steps_reward/std": 0.17712190747261047, "rewards/tag_count_reward/mean": 0.55078125, "rewards/tag_count_reward/std": 0.24923434294760227, "step": 324 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 960.25, "completions/max_terminated_length": 834.75, "completions/mean_length": 540.5625, "completions/mean_terminated_length": 478.70812225341797, "completions/min_length": 326.5, "completions/min_terminated_length": 326.5, "epoch": 0.10833333333333334, "grad_norm": 1.0782766342163086, "kl": 0.35205078125, "learning_rate": 1.9996101150403543e-05, "loss": 0.1742, "num_tokens": 15842634.0, "reward": 0.548046886920929, "reward_std": 0.042725150007754564, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.04929708316922188, "rewards/tag_count_reward/mean": 0.55859375, "rewards/tag_count_reward/std": 0.24080571345984936, "step": 325 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 934.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 463.09375, "completions/mean_terminated_length": 427.4317092895508, "completions/min_length": 251.25, "completions/min_terminated_length": 251.25, "epoch": 0.10866666666666666, "grad_norm": 0.9316300749778748, "kl": 0.34326171875, "learning_rate": 1.9995769500822007e-05, "loss": -0.0657, "num_tokens": 15881488.0, "reward": 0.5242187529802322, "reward_std": 0.09430638235062361, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9218750298023224, "rewards/reasoning_steps_reward/std": 0.18572967126965523, 
"rewards/tag_count_reward/mean": 0.6328125, "rewards/tag_count_reward/std": 0.16978351771831512, "step": 326 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.75, "completions/max_terminated_length": 870.75, "completions/mean_length": 579.671875, "completions/mean_terminated_length": 579.671875, "completions/min_length": 337.5, "completions/min_terminated_length": 337.5, "epoch": 0.109, "grad_norm": 0.873878538608551, "kl": 0.31982421875, "learning_rate": 1.999542431841749e-05, "loss": -0.0013, "num_tokens": 15926859.0, "reward": 0.5614583343267441, "reward_std": 0.030561438761651516, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.640625, "rewards/tag_count_reward/std": 0.24114800989627838, "step": 327 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 891.75, "completions/max_terminated_length": 860.0, "completions/mean_length": 568.375, "completions/mean_terminated_length": 557.2075958251953, "completions/min_length": 337.5, "completions/min_terminated_length": 337.5, "epoch": 0.10933333333333334, "grad_norm": 0.7906535863876343, "kl": 0.301025390625, "learning_rate": 1.9995065603657317e-05, "loss": 0.0167, "num_tokens": 15974915.0, "reward": 0.6875823885202408, "reward_std": 0.2568901313934475, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12716573476791382, "rewards/penalized_accuracy_reward/std": 0.22754468023777008, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.65625, "rewards/tag_count_reward/std": 0.21732262521982193, "step": 328 }, { "clip_ratio": 
0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.75, "completions/max_terminated_length": 756.75, "completions/mean_length": 528.4375, "completions/mean_terminated_length": 528.4375, "completions/min_length": 325.75, "completions/min_terminated_length": 325.75, "epoch": 0.10966666666666666, "grad_norm": 0.8963666558265686, "kl": 0.32958984375, "learning_rate": 1.999469335702714e-05, "loss": -0.0342, "num_tokens": 16015903.0, "reward": 0.5386718809604645, "reward_std": 0.06200896389782429, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9531250149011612, "rewards/reasoning_steps_reward/std": 0.0888747088611126, "rewards/tag_count_reward/mean": 0.62109375, "rewards/tag_count_reward/std": 0.2715306803584099, "step": 329 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 545.5625, "completions/mean_terminated_length": 545.5625, "completions/min_length": 291.25, "completions/min_terminated_length": 291.25, "epoch": 0.11, "grad_norm": 0.860392153263092, "kl": 0.29736328125, "learning_rate": 1.9994307579030925e-05, "loss": -0.0076, "num_tokens": 16058915.0, "reward": 0.5680989325046539, "reward_std": 0.023084016982465982, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.70703125, "rewards/tag_count_reward/std": 0.14216844737529755, "step": 330 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 955.5, "completions/max_terminated_length": 930.5, "completions/mean_length": 696.234375, 
"completions/mean_terminated_length": 691.7843780517578, "completions/min_length": 459.5, "completions/min_terminated_length": 459.5, "epoch": 0.11033333333333334, "grad_norm": 0.34245073795318604, "kl": 0.267578125, "learning_rate": 1.999390827019096e-05, "loss": -0.0036, "num_tokens": 16112370.0, "reward": 0.5704427063465118, "reward_std": 0.015046583488583565, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.049575019627809525, "step": 331 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 931.25, "completions/max_terminated_length": 926.25, "completions/mean_length": 675.40625, "completions/mean_terminated_length": 667.1205444335938, "completions/min_length": 452.5, "completions/min_terminated_length": 452.5, "epoch": 0.11066666666666666, "grad_norm": 0.736656665802002, "kl": 0.2841796875, "learning_rate": 1.999349543104785e-05, "loss": 0.0279, "num_tokens": 16164204.0, "reward": 0.5726562142372131, "reward_std": 0.007206945912912488, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.07206955552101135, "step": 332 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 866.0, "completions/max_terminated_length": 864.5, "completions/mean_length": 637.90625, "completions/mean_terminated_length": 625.4010467529297, "completions/min_length": 379.25, "completions/min_terminated_length": 379.25, "epoch": 0.111, "grad_norm": 
0.7842462062835693, "kl": 0.24560546875, "learning_rate": 1.999306906216052e-05, "loss": 0.034, "num_tokens": 16216166.0, "reward": 0.5722655951976776, "reward_std": 0.0074825764168053865, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.72265625, "rewards/tag_count_reward/std": 0.07482585124671459, "step": 333 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 960.25, "completions/max_terminated_length": 860.25, "completions/mean_length": 720.734375, "completions/mean_terminated_length": 653.4479217529297, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.11133333333333334, "grad_norm": 0.5861374735832214, "kl": 0.278564453125, "learning_rate": 1.999262916410621e-05, "loss": 0.1111, "num_tokens": 16272725.0, "reward": 0.5554687529802322, "reward_std": 0.02883127611130476, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.984375, "rewards/reasoning_steps_reward/std": 0.03359273821115494, "rewards/tag_count_reward/mean": 0.6328125, "rewards/tag_count_reward/std": 0.14413107000291348, "step": 334 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.75, "completions/max_terminated_length": 850.75, "completions/mean_length": 604.015625, "completions/mean_terminated_length": 604.015625, "completions/min_length": 425.5, "completions/min_terminated_length": 425.5, "epoch": 0.11166666666666666, "grad_norm": 0.6174341440200806, "kl": 0.27734375, "learning_rate": 1.9992175737480487e-05, "loss": -0.0022, "num_tokens": 16319542.0, "reward": 0.6292968690395355, "reward_std": 
0.15099719865247607, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0546875, "rewards/penalized_accuracy_reward/std": 0.14943470060825348, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.015625, "step": 335 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1010.75, "completions/max_terminated_length": 955.5, "completions/mean_length": 762.46875, "completions/mean_terminated_length": 724.74658203125, "completions/min_length": 403.75, "completions/min_terminated_length": 403.75, "epoch": 0.112, "grad_norm": 0.8106991052627563, "kl": 0.32421875, "learning_rate": 1.9991708782897214e-05, "loss": 0.1122, "num_tokens": 16378708.0, "reward": 0.598828136920929, "reward_std": 0.11693336139433086, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02734375, "rewards/penalized_accuracy_reward/std": 0.1093750074505806, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.07558366656303406, "step": 336 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 558.296875, "completions/mean_terminated_length": 558.296875, "completions/min_length": 357.25, "completions/min_terminated_length": 357.25, "epoch": 0.11233333333333333, "grad_norm": 0.5837851166725159, "kl": 0.323486328125, "learning_rate": 1.9991228300988586e-05, "loss": -0.0411, "num_tokens": 16422359.0, "reward": 0.5695312321186066, "reward_std": 0.009778629755601287, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6953125, "rewards/tag_count_reward/std": 0.09778633713722229, "step": 337 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.25, "completions/max_terminated_length": 916.25, "completions/mean_length": 701.453125, "completions/mean_terminated_length": 701.453125, "completions/min_length": 493.5, "completions/min_terminated_length": 493.5, "epoch": 0.11266666666666666, "grad_norm": 0.13602034747600555, "kl": 0.297607421875, "learning_rate": 1.9990734292405102e-05, "loss": 0.0119, "num_tokens": 16478340.0, "reward": 0.574999988079071, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 338 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 954.75, "completions/max_terminated_length": 931.25, "completions/mean_length": 700.265625, "completions/mean_terminated_length": 695.4854278564453, "completions/min_length": 471.25, "completions/min_terminated_length": 471.25, "epoch": 0.113, "grad_norm": 0.576396107673645, "kl": 0.301025390625, "learning_rate": 1.9990226757815582e-05, "loss": 0.0687, "num_tokens": 16533589.0, "reward": 0.5742187201976776, "reward_std": 0.0031249960884451866, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.03125, "step": 339 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, 
"completions/max_length": 939.5, "completions/max_terminated_length": 937.25, "completions/mean_length": 723.921875, "completions/mean_terminated_length": 719.9479217529297, "completions/min_length": 458.5, "completions/min_terminated_length": 458.5, "epoch": 0.11333333333333333, "grad_norm": 0.5036214590072632, "kl": 0.281982421875, "learning_rate": 1.998970569790715e-05, "loss": -0.0162, "num_tokens": 16591232.0, "reward": 0.5712239295244217, "reward_std": 0.013554674573242664, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.046875, "step": 340 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.75, "completions/max_terminated_length": 874.75, "completions/mean_length": 679.328125, "completions/mean_terminated_length": 679.328125, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.11366666666666667, "grad_norm": 0.11010044068098068, "kl": 0.271240234375, "learning_rate": 1.998917111338525e-05, "loss": 0.0109, "num_tokens": 16645077.0, "reward": 0.574999988079071, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 341 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.5, "completions/max_terminated_length": 869.5, "completions/mean_length": 642.3125, "completions/mean_terminated_length": 642.3125, "completions/min_length": 451.25, 
"completions/min_terminated_length": 451.25, "epoch": 0.114, "grad_norm": 0.4419842064380646, "kl": 0.294189453125, "learning_rate": 1.9988623004973625e-05, "loss": 0.0146, "num_tokens": 16696777.0, "reward": 0.6437411606311798, "reward_std": 0.20553494337946177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07134534418582916, "rewards/penalized_accuracy_reward/std": 0.1951182782649994, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 342 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.75, "completions/max_terminated_length": 933.75, "completions/mean_length": 598.890625, "completions/mean_terminated_length": 598.890625, "completions/min_length": 427.25, "completions/min_terminated_length": 427.25, "epoch": 0.11433333333333333, "grad_norm": 0.3295036852359772, "kl": 0.345703125, "learning_rate": 1.9988061373414342e-05, "loss": 0.0098, "num_tokens": 16746962.0, "reward": 0.5746093541383743, "reward_std": 0.0015624980442225933, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.015625, "step": 343 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 693.859375, "completions/mean_terminated_length": 693.859375, "completions/min_length": 460.75, "completions/min_terminated_length": 460.75, "epoch": 0.11466666666666667, "grad_norm": 0.6154027581214905, "kl": 0.265380859375, "learning_rate": 1.9987486219467764e-05, "loss": 0.0231, 
"num_tokens": 16803385.0, "reward": 0.5977044999599457, "reward_std": 0.17415135446935892, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03312116861343384, "rewards/penalized_accuracy_reward/std": 0.13248467445373535, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.0833333320915699, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 344 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 877.25, "completions/max_terminated_length": 863.0, "completions/mean_length": 706.3125, "completions/mean_terminated_length": 695.7043304443359, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.115, "grad_norm": 0.567071795463562, "kl": 0.263427734375, "learning_rate": 1.998689754391257e-05, "loss": 0.0459, "num_tokens": 16859837.0, "reward": 0.6614129096269608, "reward_std": 0.2113286810927093, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09097020328044891, "rewards/penalized_accuracy_reward/std": 0.19595451653003693, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.049575019627809525, "step": 345 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 850.75, "completions/max_terminated_length": 848.75, "completions/mean_length": 662.859375, "completions/mean_terminated_length": 659.011474609375, "completions/min_length": 454.25, "completions/min_terminated_length": 454.25, "epoch": 0.11533333333333333, "grad_norm": 0.44871285557746887, "kl": 0.302978515625, "learning_rate": 1.9986295347545738e-05, "loss": 0.0135, "num_tokens": 16911460.0, "reward": 0.5993489623069763, "reward_std": 0.11980467848479748, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02734375, "rewards/penalized_accuracy_reward/std": 0.1093750074505806, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.015625, "step": 346 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.5, "completions/max_terminated_length": 906.5, "completions/mean_length": 697.1875, "completions/mean_terminated_length": 697.1875, "completions/min_length": 495.5, "completions/min_terminated_length": 495.5, "epoch": 0.11566666666666667, "grad_norm": 0.30112141370773315, "kl": 0.268310546875, "learning_rate": 1.998567963118256e-05, "loss": 0.0053, "num_tokens": 16964912.0, "reward": 0.739340677857399, "reward_std": 0.25176531076431274, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16434067487716675, "rewards/penalized_accuracy_reward/std": 0.25176534056663513, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 347 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 970.75, "completions/max_terminated_length": 961.0, "completions/mean_length": 784.640625, "completions/mean_terminated_length": 775.7781524658203, "completions/min_length": 548.25, "completions/min_terminated_length": 548.25, "epoch": 0.116, "grad_norm": 0.5412817001342773, "kl": 0.266357421875, "learning_rate": 1.9985050395656617e-05, "loss": 0.0267, "num_tokens": 17027577.0, "reward": 0.7146791964769363, "reward_std": 0.3511555069126189, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14436671882867813, 
"rewards/penalized_accuracy_reward/std": 0.3467045724391937, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.703125, "rewards/tag_count_reward/std": 0.15779344737529755, "step": 348 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 916.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 709.125, "completions/mean_terminated_length": 705.4073028564453, "completions/min_length": 507.5, "completions/min_terminated_length": 507.5, "epoch": 0.11633333333333333, "grad_norm": 0.5882188677787781, "kl": 0.2978515625, "learning_rate": 1.9984407641819812e-05, "loss": 0.0179, "num_tokens": 17082321.0, "reward": 0.7354599088430405, "reward_std": 0.25636449502781034, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16280364990234375, "rewards/penalized_accuracy_reward/std": 0.24956567585468292, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.06798820197582245, "step": 349 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 995.0, "completions/max_terminated_length": 978.5, "completions/mean_length": 765.28125, "completions/mean_terminated_length": 745.8104248046875, "completions/min_length": 524.25, "completions/min_terminated_length": 524.25, "epoch": 0.11666666666666667, "grad_norm": 0.5826079845428467, "kl": 0.281494140625, "learning_rate": 1.9983751370542334e-05, "loss": 0.0384, "num_tokens": 17140627.0, "reward": 0.5707031190395355, "reward_std": 0.013732579769566655, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.70703125, "rewards/tag_count_reward/std": 0.1373258512467146, "step": 350 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 951.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 702.984375, "completions/mean_terminated_length": 699.2343902587891, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "epoch": 0.117, "grad_norm": 0.4956066906452179, "kl": 0.2626953125, "learning_rate": 1.9983081582712684e-05, "loss": -0.0183, "num_tokens": 17196450.0, "reward": 0.5630208253860474, "reward_std": 0.0379206258803606, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.06615880131721497, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.04841229319572449, "step": 351 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 960.0, "completions/max_terminated_length": 908.5, "completions/mean_length": 751.046875, "completions/mean_terminated_length": 724.80419921875, "completions/min_length": 485.75, "completions/min_terminated_length": 485.75, "epoch": 0.11733333333333333, "grad_norm": 0.6335891485214233, "kl": 0.266357421875, "learning_rate": 1.9982398279237657e-05, "loss": 0.0123, "num_tokens": 17256325.0, "reward": 0.6546921133995056, "reward_std": 0.29282613936811686, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08789525181055069, "rewards/penalized_accuracy_reward/std": 0.28226570785045624, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.66796875, "rewards/tag_count_reward/std": 0.21457599848508835, "step": 352 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.203125, "completions/max_length": 1008.25, "completions/max_terminated_length": 989.25, "completions/mean_length": 836.96875, "completions/mean_terminated_length": 810.9872283935547, "completions/min_length": 608.75, "completions/min_terminated_length": 608.75, "epoch": 0.11766666666666667, "grad_norm": 0.5203900933265686, "kl": 0.265380859375, "learning_rate": 1.998170146104234e-05, "loss": -0.0027, "num_tokens": 17318691.0, "reward": 0.5458333343267441, "reward_std": 0.07285358663648367, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9635416865348816, "rewards/reasoning_steps_reward/std": 0.12865879759192467, "rewards/tag_count_reward/mean": 0.640625, "rewards/tag_count_reward/std": 0.1502092145383358, "step": 353 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 995.25, "completions/max_terminated_length": 942.0, "completions/mean_length": 791.03125, "completions/mean_terminated_length": 752.4354553222656, "completions/min_length": 507.5, "completions/min_terminated_length": 507.5, "epoch": 0.118, "grad_norm": 0.53293377161026, "kl": 0.2705078125, "learning_rate": 1.998099112907013e-05, "loss": -0.0189, "num_tokens": 17379429.0, "reward": 0.535026028752327, "reward_std": 0.11204528529196978, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.927083358168602, "rewards/reasoning_steps_reward/std": 0.22220106050372124, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.07449322193861008, "step": 354 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 997.75, "completions/max_terminated_length": 934.25, "completions/mean_length": 
775.90625, "completions/mean_terminated_length": 751.3139343261719, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 0.11833333333333333, "grad_norm": 0.6207841634750366, "kl": 0.26513671875, "learning_rate": 1.9980267284282718e-05, "loss": 0.0175, "num_tokens": 17440223.0, "reward": 0.5654947757720947, "reward_std": 0.028851188253611326, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.70703125, "rewards/tag_count_reward/std": 0.09567352384328842, "step": 355 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 966.0, "completions/max_terminated_length": 952.75, "completions/mean_length": 815.171875, "completions/mean_terminated_length": 801.4709930419922, "completions/min_length": 624.75, "completions/min_terminated_length": 624.75, "epoch": 0.11866666666666667, "grad_norm": 0.34053659439086914, "kl": 0.239990234375, "learning_rate": 1.9979529927660076e-05, "loss": 0.0033, "num_tokens": 17502378.0, "reward": 0.5493489503860474, "reward_std": 0.05448907986283302, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9635416716337204, "rewards/reasoning_steps_reward/std": 0.10987519100308418, "rewards/tag_count_reward/mean": 0.67578125, "rewards/tag_count_reward/std": 0.08297448605298996, "step": 356 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1005.5, "completions/max_terminated_length": 979.75, "completions/mean_length": 873.84375, "completions/mean_terminated_length": 856.6114654541016, "completions/min_length": 668.0, 
"completions/min_terminated_length": 668.0, "epoch": 0.119, "grad_norm": 0.5661823749542236, "kl": 0.26953125, "learning_rate": 1.9978779060200483e-05, "loss": 0.0126, "num_tokens": 17568848.0, "reward": 0.7227516770362854, "reward_std": 0.36486852215602994, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16168397665023804, "rewards/penalized_accuracy_reward/std": 0.34969785064458847, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.63671875, "rewards/tag_count_reward/std": 0.09947755187749863, "step": 357 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.25, "completions/mean_length": 912.40625, "completions/mean_terminated_length": 845.8678894042969, "completions/min_length": 715.75, "completions/min_terminated_length": 715.75, "epoch": 0.11933333333333333, "grad_norm": 0.5193860530853271, "kl": 0.23828125, "learning_rate": 1.9978014682920503e-05, "loss": 0.0729, "num_tokens": 17636410.0, "reward": 0.5519531220197678, "reward_std": 0.023164119804278016, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.51953125, "rewards/tag_count_reward/std": 0.23164120875298977, "step": 358 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 811.34375, "completions/mean_terminated_length": 788.9231262207031, "completions/min_length": 602.5, "completions/min_terminated_length": 602.5, "epoch": 0.11966666666666667, "grad_norm": 0.6877517104148865, "kl": 0.2841796875, "learning_rate": 
1.997723679685499e-05, "loss": 0.0868, "num_tokens": 17700176.0, "reward": 0.5710937082767487, "reward_std": 0.01112984400242567, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7109375, "rewards/tag_count_reward/std": 0.11129852384328842, "step": 359 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 882.5, "completions/max_terminated_length": 880.75, "completions/mean_length": 619.78125, "completions/mean_terminated_length": 615.1302185058594, "completions/min_length": 470.25, "completions/min_terminated_length": 470.25, "epoch": 0.12, "grad_norm": 0.6614722013473511, "kl": 0.2841796875, "learning_rate": 1.9976445403057095e-05, "loss": 0.0213, "num_tokens": 17753586.0, "reward": 0.568229153752327, "reward_std": 0.025533841457217932, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.0625, "step": 360 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1017.0, "completions/max_terminated_length": 977.5, "completions/mean_length": 794.515625, "completions/mean_terminated_length": 777.9619293212891, "completions/min_length": 543.25, "completions/min_terminated_length": 543.25, "epoch": 0.12033333333333333, "grad_norm": 0.587173342704773, "kl": 0.2568359375, "learning_rate": 1.9975640502598243e-05, "loss": 0.0382, "num_tokens": 17813075.0, "reward": 0.7127092778682709, "reward_std": 0.27483612578362226, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14942806959152222, "rewards/penalized_accuracy_reward/std": 0.2673214375972748, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.0624999962747097, "rewards/tag_count_reward/mean": 0.7109375, "rewards/tag_count_reward/std": 0.12654344737529755, "step": 361 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 812.25, "completions/max_terminated_length": 802.0, "completions/mean_length": 624.359375, "completions/mean_terminated_length": 620.5625152587891, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.12066666666666667, "grad_norm": 0.578199565410614, "kl": 0.265380859375, "learning_rate": 1.9974822096568157e-05, "loss": 0.0284, "num_tokens": 17861130.0, "reward": 0.7899739593267441, "reward_std": 0.375191253144294, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.21875, "rewards/penalized_accuracy_reward/std": 0.3681847155094147, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.046875, "step": 362 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 839.5, "completions/max_terminated_length": 809.0, "completions/mean_length": 624.171875, "completions/mean_terminated_length": 610.0516967773438, "completions/min_length": 415.25, "completions/min_terminated_length": 415.25, "epoch": 0.121, "grad_norm": 0.5601195693016052, "kl": 0.2763671875, "learning_rate": 1.9973990186074844e-05, "loss": 0.0437, "num_tokens": 17911253.0, "reward": 0.5708333253860474, "reward_std": 0.012949130265042186, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, 
"rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.04081955552101135, "step": 363 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.75, "completions/max_terminated_length": 754.75, "completions/mean_length": 596.171875, "completions/mean_terminated_length": 596.171875, "completions/min_length": 450.75, "completions/min_terminated_length": 450.75, "epoch": 0.12133333333333333, "grad_norm": 0.6716346144676208, "kl": 0.2734375, "learning_rate": 1.997314477224458e-05, "loss": 0.0063, "num_tokens": 17961216.0, "reward": 0.55859375, "reward_std": 0.05590894632041454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9687500298023224, "rewards/reasoning_steps_reward/std": 0.11179707944393158, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.021347815170884132, "step": 364 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.25, "completions/max_terminated_length": 684.25, "completions/mean_length": 487.234375, "completions/mean_terminated_length": 487.234375, "completions/min_length": 359.25, "completions/min_terminated_length": 359.25, "epoch": 0.12166666666666667, "grad_norm": 0.4668358266353607, "kl": 0.3193359375, "learning_rate": 1.9972285856221944e-05, "loss": 0.0521, "num_tokens": 18000831.0, "reward": 0.7272274792194366, "reward_std": 0.37441834807395935, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1626441404223442, "rewards/penalized_accuracy_reward/std": 0.3894267678260803, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, 
"rewards/reasoning_steps_reward/std": 0.048112526535987854, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.0, "step": 365 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.5, "completions/max_terminated_length": 586.5, "completions/mean_length": 485.96875, "completions/mean_terminated_length": 485.96875, "completions/min_length": 339.75, "completions/min_terminated_length": 339.75, "epoch": 0.122, "grad_norm": 0.751215398311615, "kl": 0.33203125, "learning_rate": 1.9971413439169777e-05, "loss": 0.0108, "num_tokens": 18044077.0, "reward": 0.6277906894683838, "reward_std": 0.22004716284573078, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06516048312187195, "rewards/penalized_accuracy_reward/std": 0.17814074456691742, "rewards/reasoning_steps_reward/mean": 0.9791666865348816, "rewards/reasoning_steps_reward/std": 0.0833333283662796, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.078125, "step": 366 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.25, "completions/max_terminated_length": 613.25, "completions/mean_length": 449.234375, "completions/mean_terminated_length": 449.234375, "completions/min_length": 348.75, "completions/min_terminated_length": 348.75, "epoch": 0.12233333333333334, "grad_norm": 0.7573283314704895, "kl": 0.361328125, "learning_rate": 1.9970527522269204e-05, "loss": 0.0358, "num_tokens": 18084364.0, "reward": 0.5644531100988388, "reward_std": 0.03857939247973263, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.984375, "rewards/reasoning_steps_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.72265625, "rewards/tag_count_reward/std": 0.08957063034176826, "step": 367 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.015625, "completions/max_length": 738.0, "completions/max_terminated_length": 636.75, "completions/mean_length": 489.0625, "completions/mean_terminated_length": 480.8500061035156, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.12266666666666666, "grad_norm": 0.8003387451171875, "kl": 0.3837890625, "learning_rate": 1.9969628106719632e-05, "loss": 0.0422, "num_tokens": 18124912.0, "reward": 0.5933363288640976, "reward_std": 0.13590667862445116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02862280048429966, "rewards/penalized_accuracy_reward/std": 0.11449120938777924, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.028463751077651978, "rewards/tag_count_reward/mean": 0.69921875, "rewards/tag_count_reward/std": 0.13621489331126213, "step": 368 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 782.0, "completions/max_terminated_length": 732.25, "completions/mean_length": 533.4375, "completions/mean_terminated_length": 524.9697952270508, "completions/min_length": 367.25, "completions/min_terminated_length": 367.25, "epoch": 0.123, "grad_norm": 0.5432648062705994, "kl": 0.32763671875, "learning_rate": 1.9968715193738738e-05, "loss": 0.0231, "num_tokens": 18168988.0, "reward": 0.6612445116043091, "reward_std": 0.28088031709194183, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0874163992702961, "rewards/penalized_accuracy_reward/std": 0.28035029768943787, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.046875, "step": 369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 870.25, "completions/max_terminated_length": 714.75, 
"completions/mean_length": 544.421875, "completions/mean_terminated_length": 522.0605773925781, "completions/min_length": 387.5, "completions/min_terminated_length": 387.5, "epoch": 0.12333333333333334, "grad_norm": 0.9099456667900085, "kl": 0.35009765625, "learning_rate": 1.9967788784562474e-05, "loss": 0.1243, "num_tokens": 18214103.0, "reward": 0.5684895664453506, "reward_std": 0.02152151893824339, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.7109375, "rewards/tag_count_reward/std": 0.12654344737529755, "step": 370 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 823.5, "completions/max_terminated_length": 763.5, "completions/mean_length": 540.8125, "completions/mean_terminated_length": 526.511474609375, "completions/min_length": 351.75, "completions/min_terminated_length": 351.75, "epoch": 0.12366666666666666, "grad_norm": 0.8498043417930603, "kl": 0.3798828125, "learning_rate": 1.996684888044506e-05, "loss": 0.0597, "num_tokens": 18257963.0, "reward": 0.5631510317325592, "reward_std": 0.0337344182189554, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.68359375, "rewards/tag_count_reward/std": 0.16335688158869743, "step": 371 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 723.0, "completions/max_terminated_length": 599.25, "completions/mean_length": 525.84375, "completions/mean_terminated_length": 491.234375, "completions/min_length": 404.75, 
"completions/min_terminated_length": 404.75, "epoch": 0.124, "grad_norm": 1.7754876613616943, "kl": 0.46435546875, "learning_rate": 1.9965895482659e-05, "loss": 0.1219, "num_tokens": 18300225.0, "reward": 0.5675781071186066, "reward_std": 0.01923357043415308, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.67578125, "rewards/tag_count_reward/std": 0.19233575090765953, "step": 372 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 848.25, "completions/mean_length": 606.40625, "completions/mean_terminated_length": 537.673095703125, "completions/min_length": 349.5, "completions/min_terminated_length": 349.5, "epoch": 0.12433333333333334, "grad_norm": 1.5454201698303223, "kl": 0.68017578125, "learning_rate": 1.9964928592495046e-05, "loss": 0.0783, "num_tokens": 18348267.0, "reward": 0.6182190477848053, "reward_std": 0.28114900551736355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06665654852986336, "rewards/penalized_accuracy_reward/std": 0.26662619411945343, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.515625, "rewards/tag_count_reward/std": 0.30157821998000145, "step": 373 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 588.75, "completions/mean_length": 739.40625, "completions/mean_terminated_length": 440.1388931274414, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.12466666666666666, "grad_norm": 4.628246784210205, "kl": 0.96484375, "learning_rate": 1.9963948211262233e-05, "loss": 
0.3434, "num_tokens": 18404981.0, "reward": 0.5234375149011612, "reward_std": 0.032967695500701666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.234375, "rewards/tag_count_reward/std": 0.3296769931912422, "step": 374 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 674.5, "completions/mean_length": 785.046875, "completions/mean_terminated_length": 472.26390075683594, "completions/min_length": 319.25, "completions/min_terminated_length": 319.25, "epoch": 0.125, "grad_norm": 5.922732830047607, "kl": 2.6875, "learning_rate": 1.996295434028785e-05, "loss": 0.303, "num_tokens": 18464216.0, "reward": 0.5171875059604645, "reward_std": 0.02666703937575221, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.171875, "rewards/tag_count_reward/std": 0.26667042449116707, "step": 375 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 871.25, "completions/mean_terminated_length": 375.8041763305664, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.12533333333333332, "grad_norm": 17.34748077392578, "kl": 13.953125, "learning_rate": 1.9961946980917457e-05, "loss": 0.7341, "num_tokens": 18531832.0, "reward": 0.5095052123069763, "reward_std": 0.02496273792348802, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.12109375, "rewards/tag_count_reward/std": 0.21139980107545853, "step": 376 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 1024.0, "completions/max_terminated_length": 379.25, "completions/mean_length": 859.453125, "completions/mean_terminated_length": 333.0506057739258, "completions/min_length": 272.5, "completions/min_terminated_length": 272.5, "epoch": 0.12566666666666668, "grad_norm": 5.47519588470459, "kl": 5.6015625, "learning_rate": 1.9960926134514875e-05, "loss": 0.4119, "num_tokens": 18597093.0, "reward": 0.5238281190395355, "reward_std": 0.0280300946906209, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.23828125, "rewards/tag_count_reward/std": 0.28030097112059593, "step": 377 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 616.75, "completions/mean_length": 892.125, "completions/mean_terminated_length": 459.1666793823242, "completions/min_length": 331.75, "completions/min_terminated_length": 331.75, "epoch": 0.126, "grad_norm": 1.6290124654769897, "kl": 1.328125, "learning_rate": 1.995989180246218e-05, "loss": 0.1209, "num_tokens": 18666829.0, "reward": 0.583984375, "reward_std": 0.2336447136476636, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0546875, "rewards/penalized_accuracy_reward/std": 0.2187500149011612, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 
0.29296875, "rewards/tag_count_reward/std": 0.3123226538300514, "step": 378 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 748.75, "completions/mean_length": 807.015625, "completions/mean_terminated_length": 535.275016784668, "completions/min_length": 323.5, "completions/min_terminated_length": 323.5, "epoch": 0.12633333333333333, "grad_norm": 4.420656681060791, "kl": 0.716796875, "learning_rate": 1.9958843986159705e-05, "loss": 0.2669, "num_tokens": 18729774.0, "reward": 0.542187511920929, "reward_std": 0.031155919656157494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.421875, "rewards/tag_count_reward/std": 0.31155921518802643, "step": 379 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 650.5, "completions/mean_length": 921.90625, "completions/mean_terminated_length": 532.9642944335938, "completions/min_length": 427.75, "completions/min_terminated_length": 427.75, "epoch": 0.12666666666666668, "grad_norm": 3.003019332885742, "kl": 1.05859375, "learning_rate": 1.9957782687026046e-05, "loss": 0.1814, "num_tokens": 18800120.0, "reward": 0.5316406339406967, "reward_std": 0.028903153259307146, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.31640625, "rewards/tag_count_reward/std": 0.2890315502882004, "step": 380 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 297.5, "completions/mean_length": 985.6875, "completions/mean_terminated_length": 285.875, "completions/min_length": 530.25, "completions/min_terminated_length": 274.25, "epoch": 0.127, "grad_norm": 2.79579758644104, "kl": 1.951171875, "learning_rate": 1.9956707906498046e-05, "loss": 0.1413, "num_tokens": 18871812.0, "reward": 0.5439791083335876, "reward_std": 0.1430248417891562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.031479090452194214, "rewards/penalized_accuracy_reward/std": 0.12591636180877686, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.125, "rewards/tag_count_reward/std": 0.2432589866220951, "step": 381 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.25, "completions/mean_length": 770.203125, "completions/mean_terminated_length": 614.058349609375, "completions/min_length": 251.25, "completions/min_terminated_length": 251.25, "epoch": 0.12733333333333333, "grad_norm": 4.434508800506592, "kl": 1.71875, "learning_rate": 1.99556196460308e-05, "loss": 0.2662, "num_tokens": 18929281.0, "reward": 0.5570312291383743, "reward_std": 0.026602684520184994, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5703125, "rewards/tag_count_reward/std": 0.26602689176797867, "step": 382 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 992.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 495.375, "completions/mean_terminated_length": 461.93975830078125, "completions/min_length": 225.75, 
"completions/min_terminated_length": 225.75, "epoch": 0.12766666666666668, "grad_norm": 3.1257944107055664, "kl": 2.83984375, "learning_rate": 1.9954517907097663e-05, "loss": 0.2395, "num_tokens": 18970041.0, "reward": 0.5675781071186066, "reward_std": 0.017155689420178533, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.67578125, "rewards/tag_count_reward/std": 0.17155694775283337, "step": 383 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 899.0, "completions/max_terminated_length": 882.5, "completions/mean_length": 484.734375, "completions/mean_terminated_length": 474.9291687011719, "completions/min_length": 206.5, "completions/min_terminated_length": 206.5, "epoch": 0.128, "grad_norm": 4.114688396453857, "kl": 2.6572265625, "learning_rate": 1.9953402691190218e-05, "loss": 0.1518, "num_tokens": 19010744.0, "reward": 0.5683593451976776, "reward_std": 0.013292336370795965, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.68359375, "rewards/tag_count_reward/std": 0.1329234316945076, "step": 384 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1022.0, "completions/max_terminated_length": 944.5, "completions/mean_length": 647.09375, "completions/mean_terminated_length": 613.0419769287109, "completions/min_length": 319.75, "completions/min_terminated_length": 319.75, "epoch": 0.12833333333333333, "grad_norm": 2.3895230293273926, "kl": 0.8369140625, "learning_rate": 1.9952273999818312e-05, "loss": 0.14, "num_tokens": 
19062798.0, "reward": 0.5722656100988388, "reward_std": 0.006789007456973195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.72265625, "rewards/tag_count_reward/std": 0.06789018586277962, "step": 385 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1017.25, "completions/max_terminated_length": 948.25, "completions/mean_length": 666.53125, "completions/mean_terminated_length": 637.3544921875, "completions/min_length": 382.75, "completions/min_terminated_length": 382.75, "epoch": 0.12866666666666668, "grad_norm": 4.826654434204102, "kl": 2.22265625, "learning_rate": 1.9951131834510034e-05, "loss": 0.2538, "num_tokens": 19114320.0, "reward": 0.567187488079071, "reward_std": 0.022259699180722237, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.671875, "rewards/tag_count_reward/std": 0.22259704768657684, "step": 386 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 863.5, "completions/mean_length": 737.1875, "completions/mean_terminated_length": 614.6210479736328, "completions/min_length": 395.25, "completions/min_terminated_length": 395.25, "epoch": 0.129, "grad_norm": 18.4852294921875, "kl": 16.25, "learning_rate": 1.99499761968117e-05, "loss": 0.8592, "num_tokens": 19170492.0, "reward": 0.5429687350988388, "reward_std": 0.04709636978805065, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.984375, "rewards/reasoning_steps_reward/std": 0.03359273821115494, "rewards/tag_count_reward/mean": 0.5078125, "rewards/tag_count_reward/std": 0.33248988538980484, "step": 387 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 808.25, "completions/mean_length": 687.9375, "completions/mean_terminated_length": 549.6305084228516, "completions/min_length": 338.25, "completions/min_terminated_length": 338.25, "epoch": 0.12933333333333333, "grad_norm": 10.924671173095703, "kl": 12.734375, "learning_rate": 1.9948807088287884e-05, "loss": 0.7275, "num_tokens": 19224488.0, "reward": 0.5419270843267441, "reward_std": 0.0634591830894351, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9791666865348816, "rewards/reasoning_steps_reward/std": 0.0833333283662796, "rewards/tag_count_reward/mean": 0.5234375, "rewards/tag_count_reward/std": 0.31878840923309326, "step": 388 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.25, "completions/mean_length": 768.328125, "completions/mean_terminated_length": 704.0210876464844, "completions/min_length": 384.75, "completions/min_terminated_length": 384.75, "epoch": 0.12966666666666668, "grad_norm": 2.4064109325408936, "kl": 1.265625, "learning_rate": 1.9947624510521385e-05, "loss": 0.1485, "num_tokens": 19282669.0, "reward": 0.5677083283662796, "reward_std": 0.017858162289485335, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 
0.0208333320915699, "rewards/tag_count_reward/mean": 0.703125, "rewards/tag_count_reward/std": 0.09968777745962143, "step": 389 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 966.0, "completions/max_terminated_length": 955.25, "completions/mean_length": 700.609375, "completions/mean_terminated_length": 683.7254638671875, "completions/min_length": 413.5, "completions/min_terminated_length": 413.5, "epoch": 0.13, "grad_norm": 2.1699938774108887, "kl": 0.865234375, "learning_rate": 1.9946428465113244e-05, "loss": 0.0691, "num_tokens": 19340228.0, "reward": 0.5976562649011612, "reward_std": 0.12052598781883717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02734375, "rewards/penalized_accuracy_reward/std": 0.1093750074505806, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.703125, "rewards/tag_count_reward/std": 0.1259822454303503, "step": 390 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 1024.0, "completions/max_terminated_length": 210.25, "completions/mean_length": 989.140625, "completions/mean_terminated_length": 210.25, "completions/min_length": 466.25, "completions/min_terminated_length": 210.25, "epoch": 0.13033333333333333, "grad_norm": 10.679924964904785, "kl": 7.421875, "learning_rate": 1.9945218953682736e-05, "loss": 0.2918, "num_tokens": 19413789.0, "reward": 0.5009114369750023, "reward_std": 0.05319773964583874, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9635417014360428, "rewards/reasoning_steps_reward/std": 0.10622458532452583, "rewards/tag_count_reward/mean": 0.19140625, "rewards/tag_count_reward/std": 0.17257864400744438, "step": 391 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.13066666666666665, "grad_norm": 6.155830383300781, "kl": 3.9375, "learning_rate": 1.9943995977867358e-05, "loss": 0.1575, "num_tokens": 19489197.0, "reward": 0.48684895783662796, "reward_std": 0.08839214779436588, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9479166865348816, "rewards/reasoning_steps_reward/std": 0.17237518727779388, "rewards/tag_count_reward/mean": 0.12890625, "rewards/tag_count_reward/std": 0.1510934755206108, "step": 392 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.131, "grad_norm": 2.1352784633636475, "kl": 1.34765625, "learning_rate": 1.9942759539322845e-05, "loss": 0.0539, "num_tokens": 19563085.0, "reward": 0.4867187440395355, "reward_std": 0.09090410731732845, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9531250298023224, "rewards/reasoning_steps_reward/std": 0.17429707944393158, "rewards/tag_count_reward/mean": 0.1015625, "rewards/tag_count_reward/std": 0.16582809947431087, "step": 393 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.13133333333333333, "grad_norm": 0.7536581754684448, "kl": 0.5791015625, "learning_rate": 1.9941509639723155e-05, "loss": 0.0232, "num_tokens": 19640941.0, "reward": 0.510937511920929, "reward_std": 0.013339085271582007, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.109375, "rewards/tag_count_reward/std": 0.13339098542928696, "step": 394 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.13166666666666665, "grad_norm": 0.6516260504722595, "kl": 0.5595703125, "learning_rate": 1.9940246280760473e-05, "loss": 0.0224, "num_tokens": 19716349.0, "reward": 0.5140624940395355, "reward_std": 0.017406899016350508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.140625, "rewards/tag_count_reward/std": 0.1740690991282463, "step": 395 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.132, "grad_norm": 1.2135748863220215, "kl": 0.5634765625, "learning_rate": 1.99389694641452e-05, "loss": 0.0225, "num_tokens": 19793085.0, "reward": 
0.5101562589406967, "reward_std": 0.015438517788425088, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1015625, "rewards/tag_count_reward/std": 0.15438531525433064, "step": 396 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.13233333333333333, "grad_norm": 0.4990180432796478, "kl": 0.27490234375, "learning_rate": 1.9937679191605964e-05, "loss": 0.011, "num_tokens": 19868333.0, "reward": 0.46757811307907104, "reward_std": 0.08261131285689771, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9218750149011612, "rewards/reasoning_steps_reward/std": 0.15227919816970825, "rewards/tag_count_reward/mean": 0.06640625, "rewards/tag_count_reward/std": 0.10607585124671459, "step": 397 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.13266666666666665, "grad_norm": 0.3590187132358551, "kl": 0.30517578125, "learning_rate": 1.9936375464889608e-05, "loss": 0.0122, "num_tokens": 19942237.0, "reward": 0.4924479126930237, "reward_std": 0.05821423279121518, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.1041666641831398, "rewards/tag_count_reward/mean": 0.0546875, "rewards/tag_count_reward/std": 0.10298692621290684, "step": 398 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 36.25, "completions/mean_length": 1010.265625, "completions/mean_terminated_length": 36.25, "completions/min_length": 804.25, "completions/min_terminated_length": 36.25, "epoch": 0.133, "grad_norm": 0.419431209564209, "kl": 0.32568359375, "learning_rate": 1.9935058285761185e-05, "loss": -0.0386, "num_tokens": 20017102.0, "reward": 0.4997395724058151, "reward_std": 0.013995711575262249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.0234375, "rewards/tag_count_reward/std": 0.07206955552101135, "step": 399 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.13333333333333333, "grad_norm": 0.6517958641052246, "kl": 0.4580078125, "learning_rate": 1.9933727656003964e-05, "loss": 0.0183, "num_tokens": 20094958.0, "reward": 0.5682201683521271, "reward_std": 0.16847115964628756, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.057673290371894836, "rewards/penalized_accuracy_reward/std": 0.15761536359786987, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.10546875, "rewards/tag_count_reward/std": 0.15373114496469498, "step": 400 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 1024.0, "completions/max_terminated_length": 72.75, "completions/mean_length": 988.0, "completions/mean_terminated_length": 64.0, "completions/min_length": 826.75, "completions/min_terminated_length": 58.75, "epoch": 0.13366666666666666, "grad_norm": 0.6319084167480469, "kl": 0.521484375, "learning_rate": 1.9932383577419432e-05, "loss": 0.0716, "num_tokens": 20170494.0, "reward": 0.5425781011581421, "reward_std": 0.12124298885464668, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02734375, "rewards/penalized_accuracy_reward/std": 0.1093750074505806, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.15234375, "rewards/tag_count_reward/std": 0.14299625158309937, "step": 401 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 1024.0, "completions/max_terminated_length": 851.75, "completions/mean_length": 848.671875, "completions/mean_terminated_length": 644.4384002685547, "completions/min_length": 446.75, "completions/min_terminated_length": 446.75, "epoch": 0.134, "grad_norm": 0.7354158163070679, "kl": 0.50830078125, "learning_rate": 1.993102605182727e-05, "loss": 0.127, "num_tokens": 20238521.0, "reward": 0.5234374850988388, "reward_std": 0.018410819116979837, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.234375, "rewards/tag_count_reward/std": 0.18410814180970192, "step": 402 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 
953.75, "completions/max_terminated_length": 953.75, "completions/mean_length": 533.84375, "completions/mean_terminated_length": 533.84375, "completions/min_length": 180.25, "completions/min_terminated_length": 180.25, "epoch": 0.13433333333333333, "grad_norm": 0.7467077374458313, "kl": 0.4462890625, "learning_rate": 1.992965508106537e-05, "loss": -0.115, "num_tokens": 20285247.0, "reward": 0.5534554272890091, "reward_std": 0.17384058702737093, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.031710632145404816, "rewards/penalized_accuracy_reward/std": 0.12684252858161926, "rewards/reasoning_steps_reward/mean": 0.973958358168602, "rewards/reasoning_steps_reward/std": 0.1041666604578495, "rewards/tag_count_reward/mean": 0.34765625, "rewards/tag_count_reward/std": 0.13082130625844002, "step": 403 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.25, "completions/mean_length": 858.03125, "completions/mean_terminated_length": 764.8402862548828, "completions/min_length": 512.5, "completions/min_terminated_length": 512.5, "epoch": 0.13466666666666666, "grad_norm": 0.7295824885368347, "kl": 0.3515625, "learning_rate": 1.9928270666989835e-05, "loss": 0.1089, "num_tokens": 20349473.0, "reward": 0.5296874940395355, "reward_std": 0.017016594298183918, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.296875, "rewards/tag_count_reward/std": 0.17016583308577538, "step": 404 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1006.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 796.265625, "completions/mean_terminated_length": 
744.2351226806641, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.135, "grad_norm": 0.7597048282623291, "kl": 0.38818359375, "learning_rate": 1.9926872811474952e-05, "loss": 0.1247, "num_tokens": 20410114.0, "reward": 0.5359375029802322, "reward_std": 0.016042925650253892, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.359375, "rewards/tag_count_reward/std": 0.16042909026145935, "step": 405 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 813.25, "completions/max_terminated_length": 725.0, "completions/mean_length": 454.0, "completions/mean_terminated_length": 444.6010437011719, "completions/min_length": 208.25, "completions/min_terminated_length": 208.25, "epoch": 0.13533333333333333, "grad_norm": 0.9214096069335938, "kl": 0.42822265625, "learning_rate": 1.9925461516413224e-05, "loss": 0.012, "num_tokens": 20447826.0, "reward": 0.5419271141290665, "reward_std": 0.019258577842265368, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.4453125, "rewards/tag_count_reward/std": 0.11347946338355541, "step": 406 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 367.0625, "completions/mean_terminated_length": 367.0625, "completions/min_length": 179.5, "completions/min_terminated_length": 179.5, "epoch": 0.13566666666666666, "grad_norm": 0.9415867328643799, "kl": 0.47021484375, 
"learning_rate": 1.992403678371533e-05, "loss": -0.0278, "num_tokens": 20481142.0, "reward": 0.5308593809604645, "reward_std": 0.05694529181346297, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9687500149011612, "rewards/reasoning_steps_reward/std": 0.11179708316922188, "rewards/tag_count_reward/mean": 0.46484375, "rewards/tag_count_reward/std": 0.10490768030285835, "step": 407 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.25, "completions/max_terminated_length": 600.25, "completions/mean_length": 372.21875, "completions/mean_terminated_length": 372.21875, "completions/min_length": 210.75, "completions/min_terminated_length": 210.75, "epoch": 0.136, "grad_norm": 0.909557044506073, "kl": 0.4541015625, "learning_rate": 1.9922598615310157e-05, "loss": -0.033, "num_tokens": 20513140.0, "reward": 0.48268231749534607, "reward_std": 0.129038886167109, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.8802083432674408, "rewards/reasoning_steps_reward/std": 0.2561502903699875, "rewards/tag_count_reward/mean": 0.42578125, "rewards/tag_count_reward/std": 0.12520484067499638, "step": 408 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 939.75, "completions/max_terminated_length": 784.5, "completions/mean_length": 545.140625, "completions/mean_terminated_length": 528.7177200317383, "completions/min_length": 287.5, "completions/min_terminated_length": 287.5, "epoch": 0.13633333333333333, "grad_norm": 0.7070683836936951, "kl": 0.3916015625, "learning_rate": 1.9921147013144782e-05, "loss": -0.0569, "num_tokens": 20555485.0, "reward": 0.5261718928813934, "reward_std": 0.04742140416055918, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.96875, "rewards/reasoning_steps_reward/std": 0.07375510036945343, "rewards/tag_count_reward/mean": 0.41796875, "rewards/tag_count_reward/std": 0.12568620964884758, "step": 409 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 685.40625, "completions/mean_terminated_length": 685.40625, "completions/min_length": 433.25, "completions/min_terminated_length": 433.25, "epoch": 0.13666666666666666, "grad_norm": 0.3983977437019348, "kl": 0.33984375, "learning_rate": 1.9919681979184452e-05, "loss": 0.005, "num_tokens": 20609735.0, "reward": 0.5492187440395355, "reward_std": 0.0031250000465661287, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4921875, "rewards/tag_count_reward/std": 0.03125, "step": 410 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 942.25, "completions/max_terminated_length": 937.75, "completions/mean_length": 682.890625, "completions/mean_terminated_length": 673.7812652587891, "completions/min_length": 421.25, "completions/min_terminated_length": 421.25, "epoch": 0.137, "grad_norm": 0.4985494613647461, "kl": 0.3798828125, "learning_rate": 1.9918203515412616e-05, "loss": 0.0338, "num_tokens": 20663232.0, "reward": 0.630859375, "reward_std": 0.17888134391978383, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08203125, "rewards/penalized_accuracy_reward/std": 0.17636188864707947, 
"rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.48828125, "rewards/tag_count_reward/std": 0.025194555521011353, "step": 411 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.75, "completions/mean_length": 808.875, "completions/mean_terminated_length": 773.337158203125, "completions/min_length": 520.5, "completions/min_terminated_length": 520.5, "epoch": 0.13733333333333334, "grad_norm": 0.6049057245254517, "kl": 0.3701171875, "learning_rate": 1.9916711623830904e-05, "loss": 0.0982, "num_tokens": 20726072.0, "reward": 0.5429687649011612, "reward_std": 0.014349436154589057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4296875, "rewards/tag_count_reward/std": 0.1434942465275526, "step": 412 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1021.75, "completions/max_terminated_length": 988.75, "completions/mean_length": 793.25, "completions/mean_terminated_length": 764.8887176513672, "completions/min_length": 489.75, "completions/min_terminated_length": 489.75, "epoch": 0.13766666666666666, "grad_norm": 0.5656830072402954, "kl": 0.4033203125, "learning_rate": 1.9915206306459117e-05, "loss": 0.0979, "num_tokens": 20786552.0, "reward": 0.5460937470197678, "reward_std": 0.0076294910395517945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4609375, "rewards/tag_count_reward/std": 0.07629487104713917, "step": 
413 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 932.5, "completions/max_terminated_length": 862.75, "completions/mean_length": 656.46875, "completions/mean_terminated_length": 645.9791870117188, "completions/min_length": 375.5, "completions/min_terminated_length": 375.5, "epoch": 0.138, "grad_norm": 0.542533814907074, "kl": 0.3994140625, "learning_rate": 1.9913687565335237e-05, "loss": 0.0684, "num_tokens": 20836710.0, "reward": 0.6936656385660172, "reward_std": 0.2626067877281457, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14483754336833954, "rewards/penalized_accuracy_reward/std": 0.2592443823814392, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.48828125, "rewards/tag_count_reward/std": 0.046875, "step": 414 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 867.75, "completions/max_terminated_length": 823.25, "completions/mean_length": 624.984375, "completions/mean_terminated_length": 606.9050598144531, "completions/min_length": 408.75, "completions/min_terminated_length": 408.75, "epoch": 0.13833333333333334, "grad_norm": 0.6048027276992798, "kl": 0.4267578125, "learning_rate": 1.991215540251542e-05, "loss": 0.0707, "num_tokens": 20887781.0, "reward": 0.5753906220197678, "reward_std": 0.11373258696403354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02734375, "rewards/penalized_accuracy_reward/std": 0.1093750074505806, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.48046875, "rewards/tag_count_reward/std": 0.04357585124671459, "step": 415 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.25, "completions/max_terminated_length": 741.25, 
"completions/mean_length": 537.265625, "completions/mean_terminated_length": 537.265625, "completions/min_length": 342.5, "completions/min_terminated_length": 342.5, "epoch": 0.13866666666666666, "grad_norm": 0.7892355918884277, "kl": 0.46240234375, "learning_rate": 1.9910609820073986e-05, "loss": 0.0576, "num_tokens": 20931206.0, "reward": 0.5753906220197678, "reward_std": 0.11475004884414375, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02734375, "rewards/penalized_accuracy_reward/std": 0.1093750074505806, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.48046875, "rewards/tag_count_reward/std": 0.06822281517088413, "step": 416 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.25, "completions/max_terminated_length": 637.25, "completions/mean_length": 431.984375, "completions/mean_terminated_length": 431.984375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.139, "grad_norm": 0.6798685789108276, "kl": 0.41162109375, "learning_rate": 1.990905082010344e-05, "loss": 0.012, "num_tokens": 20968213.0, "reward": 0.5440104156732559, "reward_std": 0.022408843622542918, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.4921875, "rewards/tag_count_reward/std": 0.03125, "step": 417 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.75, "completions/max_terminated_length": 540.75, "completions/mean_length": 373.71875, "completions/mean_terminated_length": 373.71875, "completions/min_length": 242.25, "completions/min_terminated_length": 242.25, "epoch": 
0.13933333333333334, "grad_norm": 0.5801618099212646, "kl": 0.49365234375, "learning_rate": 1.9907478404714438e-05, "loss": 0.0064, "num_tokens": 21002035.0, "reward": 0.6189446747303009, "reward_std": 0.22255618683993816, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07415299117565155, "rewards/penalized_accuracy_reward/std": 0.20262455940246582, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.03125, "step": 418 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 421.625, "completions/mean_terminated_length": 421.625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.13966666666666666, "grad_norm": 0.6660025119781494, "kl": 0.498046875, "learning_rate": 1.9905892576035798e-05, "loss": -0.0255, "num_tokens": 21038299.0, "reward": 0.5717447996139526, "reward_std": 0.123722143471241, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02734375, "rewards/penalized_accuracy_reward/std": 0.1093750074505806, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.028463751077651978, "rewards/tag_count_reward/mean": 0.49609375, "rewards/tag_count_reward/std": 0.015625, "step": 419 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 701.5, "completions/max_terminated_length": 629.5, "completions/mean_length": 463.1875, "completions/mean_terminated_length": 455.28334045410156, "completions/min_length": 289.5, "completions/min_terminated_length": 289.5, "epoch": 0.14, "grad_norm": 0.7708501815795898, "kl": 0.51708984375, "learning_rate": 1.9904293336214508e-05, "loss": 0.0814, 
"num_tokens": 21078743.0, "reward": 0.548828125, "reward_std": 0.0036972808884456754, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.48828125, "rewards/tag_count_reward/std": 0.03697281517088413, "step": 420 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 766.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 469.25, "completions/mean_terminated_length": 462.2708435058594, "completions/min_length": 277.75, "completions/min_terminated_length": 277.75, "epoch": 0.14033333333333334, "grad_norm": 0.7420164346694946, "kl": 0.5908203125, "learning_rate": 1.9902680687415704e-05, "loss": 0.0502, "num_tokens": 21118359.0, "reward": 0.7088874876499176, "reward_std": 0.2456548601621762, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.15927812457084656, "rewards/penalized_accuracy_reward/std": 0.24409236013889313, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.49609375, "rewards/tag_count_reward/std": 0.015625, "step": 421 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 708.75, "completions/max_terminated_length": 605.25, "completions/mean_length": 409.140625, "completions/mean_terminated_length": 400.08959197998047, "completions/min_length": 226.75, "completions/min_terminated_length": 226.75, "epoch": 0.14066666666666666, "grad_norm": 1.2998566627502441, "kl": 0.611328125, "learning_rate": 1.990105463182268e-05, "loss": 0.0206, "num_tokens": 21154512.0, "reward": 0.5515625029802322, "reward_std": 0.008713944582268596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.515625, "rewards/tag_count_reward/std": 0.08713950775563717, "step": 422 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 775.75, "completions/max_terminated_length": 679.75, "completions/mean_length": 483.9375, "completions/mean_terminated_length": 475.6291732788086, "completions/min_length": 239.25, "completions/min_terminated_length": 239.25, "epoch": 0.141, "grad_norm": 1.5227426290512085, "kl": 0.53173828125, "learning_rate": 1.989941517163688e-05, "loss": 0.0387, "num_tokens": 21195212.0, "reward": 0.5609374940395355, "reward_std": 0.01383043429814279, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.609375, "rewards/tag_count_reward/std": 0.13830446638166904, "step": 423 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 869.75, "completions/max_terminated_length": 859.0, "completions/mean_length": 622.78125, "completions/mean_terminated_length": 596.5625, "completions/min_length": 367.5, "completions/min_terminated_length": 367.5, "epoch": 0.14133333333333334, "grad_norm": 1.3768945932388306, "kl": 0.732421875, "learning_rate": 1.989776230907789e-05, "loss": 0.0866, "num_tokens": 21245006.0, "reward": 0.5614583343267441, "reward_std": 0.01999457157216966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, 
"rewards/tag_count_reward/mean": 0.640625, "rewards/tag_count_reward/std": 0.13019821606576443, "step": 424 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 941.25, "completions/max_terminated_length": 793.0, "completions/mean_length": 609.265625, "completions/mean_terminated_length": 558.4790954589844, "completions/min_length": 410.75, "completions/min_terminated_length": 410.75, "epoch": 0.14166666666666666, "grad_norm": 2.4814653396606445, "kl": 1.353515625, "learning_rate": 1.9896096046383456e-05, "loss": 0.1494, "num_tokens": 21296671.0, "reward": 0.5548176914453506, "reward_std": 0.03875686880201101, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.05442607030272484, "rewards/tag_count_reward/mean": 0.65234375, "rewards/tag_count_reward/std": 0.18830689042806625, "step": 425 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 825.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 642.125, "completions/mean_terminated_length": 458.1071472167969, "completions/min_length": 314.25, "completions/min_terminated_length": 314.25, "epoch": 0.142, "grad_norm": 3.4336211681365967, "kl": 4.107421875, "learning_rate": 1.9894416385809444e-05, "loss": 0.2316, "num_tokens": 21347255.0, "reward": 0.5065104141831398, "reward_std": 0.09406902268528938, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.911458358168602, "rewards/reasoning_steps_reward/std": 0.17035314068198204, "rewards/tag_count_reward/mean": 0.5078125, "rewards/tag_count_reward/std": 0.16901902854442596, "step": 426 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 735.015625, "completions/mean_terminated_length": 505.79168701171875, "completions/min_length": 331.75, "completions/min_terminated_length": 331.75, "epoch": 0.14233333333333334, "grad_norm": 10.202553749084473, "kl": 8.59375, "learning_rate": 1.9892723329629885e-05, "loss": 0.3914, "num_tokens": 21404472.0, "reward": 0.5317086279392242, "reward_std": 0.22374617960304022, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0341825895011425, "rewards/penalized_accuracy_reward/std": 0.13673035800457, "rewards/reasoning_steps_reward/mean": 0.9114583283662796, "rewards/reasoning_steps_reward/std": 0.22356067970395088, "rewards/tag_count_reward/mean": 0.41796875, "rewards/tag_count_reward/std": 0.2967885471880436, "step": 427 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 923.75, "completions/max_terminated_length": 847.25, "completions/mean_length": 603.96875, "completions/mean_terminated_length": 542.2116241455078, "completions/min_length": 326.5, "completions/min_terminated_length": 326.5, "epoch": 0.14266666666666666, "grad_norm": 6.315878868103027, "kl": 5.078125, "learning_rate": 1.9891016880136923e-05, "loss": 0.2559, "num_tokens": 21452550.0, "reward": 0.5242187529802322, "reward_std": 0.08881460968405008, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9218750149011612, "rewards/reasoning_steps_reward/std": 0.16595671698451042, "rewards/tag_count_reward/mean": 0.6328125, "rewards/tag_count_reward/std": 0.21946558356285095, "step": 428 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 941.5, "completions/max_terminated_length": 
845.75, "completions/mean_length": 639.0625, "completions/mean_terminated_length": 571.6411590576172, "completions/min_length": 395.75, "completions/min_terminated_length": 395.75, "epoch": 0.143, "grad_norm": 4.72757625579834, "kl": 2.603515625, "learning_rate": 1.988929703964084e-05, "loss": 0.1405, "num_tokens": 21504650.0, "reward": 0.4993489533662796, "reward_std": 0.24587925523519516, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02734375, "rewards/penalized_accuracy_reward/std": 0.1093750074505806, "rewards/reasoning_steps_reward/mean": 0.8229166865348816, "rewards/reasoning_steps_reward/std": 0.2813824266195297, "rewards/tag_count_reward/mean": 0.60546875, "rewards/tag_count_reward/std": 0.215969055891037, "step": 429 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 897.75, "completions/max_terminated_length": 872.5, "completions/mean_length": 695.03125, "completions/mean_terminated_length": 678.5364685058594, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.14333333333333334, "grad_norm": 1.2827024459838867, "kl": 0.5068359375, "learning_rate": 1.988756381047006e-05, "loss": 0.0198, "num_tokens": 21559788.0, "reward": 0.47968750447034836, "reward_std": 0.14802964963018894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.8125000298023224, "rewards/reasoning_steps_reward/std": 0.2938147969543934, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.05920085124671459, "step": 430 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 965.5, "completions/max_terminated_length": 959.5, "completions/mean_length": 723.984375, "completions/mean_terminated_length": 719.9395904541016, "completions/min_length": 
535.25, "completions/min_terminated_length": 535.25, "epoch": 0.14366666666666666, "grad_norm": 1.3184837102890015, "kl": 0.708984375, "learning_rate": 1.9885817194971116e-05, "loss": 0.0187, "num_tokens": 21615755.0, "reward": 0.5630208253860474, "reward_std": 0.03649181989021599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.06615880131721497, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.049619100987911224, "step": 431 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 962.5, "completions/max_terminated_length": 951.25, "completions/mean_length": 810.46875, "completions/mean_terminated_length": 799.4933166503906, "completions/min_length": 590.5, "completions/min_terminated_length": 590.5, "epoch": 0.144, "grad_norm": 5.999351978302002, "kl": 1.052734375, "learning_rate": 1.9884057195508683e-05, "loss": 0.0095, "num_tokens": 21677961.0, "reward": 0.5640624910593033, "reward_std": 0.034836260601878166, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9843750149011612, "rewards/reasoning_steps_reward/std": 0.0624999962747097, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.08570349216461182, "step": 432 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 966.8125, "completions/mean_terminated_length": 907.8899078369141, "completions/min_length": 792.25, "completions/min_terminated_length": 792.25, "epoch": 0.14433333333333334, "grad_norm": 1.0965672731399536, "kl": 0.70361328125, 
"learning_rate": 1.988228381446553e-05, "loss": 0.0648, "num_tokens": 21754045.0, "reward": 0.705226257443428, "reward_std": 0.23013893724419177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14350752532482147, "rewards/penalized_accuracy_reward/std": 0.22008824348449707, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6171875, "rewards/tag_count_reward/std": 0.12455067038536072, "step": 433 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 1024.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 981.5625, "completions/mean_terminated_length": 658.8361206054688, "completions/min_length": 798.25, "completions/min_terminated_length": 542.25, "epoch": 0.14466666666666667, "grad_norm": 1.322916030883789, "kl": 1.2490234375, "learning_rate": 1.9880497054242566e-05, "loss": 0.0701, "num_tokens": 21827137.0, "reward": 0.5509114414453506, "reward_std": 0.023322110762819648, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.53515625, "rewards/tag_count_reward/std": 0.1606542058289051, "step": 434 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.75, "completions/mean_length": 957.296875, "completions/mean_terminated_length": 908.6064605712891, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 0.145, "grad_norm": 0.9156481623649597, "kl": 0.80517578125, "learning_rate": 1.987869691725881e-05, "loss": 0.0678, "num_tokens": 21899508.0, "reward": 0.5598958283662796, "reward_std": 0.02003988972865045, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.625, "rewards/tag_count_reward/std": 0.14209389127790928, "step": 435 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 930.546875, "completions/mean_terminated_length": 858.4285888671875, "completions/min_length": 634.75, "completions/min_terminated_length": 634.75, "epoch": 0.14533333333333334, "grad_norm": 1.8983979225158691, "kl": 1.8017578125, "learning_rate": 1.9876883405951378e-05, "loss": 0.1319, "num_tokens": 21967799.0, "reward": 0.5892978459596634, "reward_std": 0.15209323493763804, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0330478698015213, "rewards/penalized_accuracy_reward/std": 0.1321914792060852, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5625, "rewards/tag_count_reward/std": 0.230451051145792, "step": 436 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.25, "completions/mean_length": 883.671875, "completions/mean_terminated_length": 824.1293487548828, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "epoch": 0.14566666666666667, "grad_norm": 1.28177809715271, "kl": 2.025390625, "learning_rate": 1.9875056522775506e-05, "loss": 0.0907, "num_tokens": 22034738.0, "reward": 0.6134114563465118, "reward_std": 0.1763192261569202, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0546875, 
"rewards/penalized_accuracy_reward/std": 0.14943470060825348, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.61328125, "rewards/tag_count_reward/std": 0.20768004097044468, "step": 437 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.5, "completions/mean_length": 882.546875, "completions/mean_terminated_length": 830.4378967285156, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 0.146, "grad_norm": 2.7501630783081055, "kl": 2.984375, "learning_rate": 1.987321627020453e-05, "loss": 0.1544, "num_tokens": 22103237.0, "reward": 0.6039062291383743, "reward_std": 0.190100381616503, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0546875, "rewards/penalized_accuracy_reward/std": 0.14943470060825348, "rewards/reasoning_steps_reward/mean": 0.984375, "rewards/reasoning_steps_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.5703125, "rewards/tag_count_reward/std": 0.21030498296022415, "step": 438 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1020.75, "completions/max_terminated_length": 994.0, "completions/mean_length": 893.90625, "completions/mean_terminated_length": 835.4974517822266, "completions/min_length": 621.5, "completions/min_terminated_length": 621.5, "epoch": 0.14633333333333334, "grad_norm": 2.1809208393096924, "kl": 2.060546875, "learning_rate": 1.987136265072988e-05, "loss": 0.1069, "num_tokens": 22170399.0, "reward": 0.5574218779802322, "reward_std": 0.023608210729435086, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.57421875, "rewards/tag_count_reward/std": 0.2360821943730116, "step": 439 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.75, "completions/mean_length": 886.671875, "completions/mean_terminated_length": 785.5016021728516, "completions/min_length": 618.5, "completions/min_terminated_length": 618.5, "epoch": 0.14666666666666667, "grad_norm": 2.85353946685791, "kl": 1.9375, "learning_rate": 1.9869495666861094e-05, "loss": 0.1311, "num_tokens": 22239306.0, "reward": 0.5869790464639664, "reward_std": 0.15728294849395752, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03385403752326965, "rewards/penalized_accuracy_reward/std": 0.1354161649942398, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.53125, "rewards/tag_count_reward/std": 0.28653404489159584, "step": 440 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.5, "completions/mean_length": 866.765625, "completions/mean_terminated_length": 788.3967895507812, "completions/min_length": 599.0, "completions/min_terminated_length": 599.0, "epoch": 0.147, "grad_norm": 1.6231287717819214, "kl": 4.0546875, "learning_rate": 1.9867615321125796e-05, "loss": 0.2683, "num_tokens": 22307451.0, "reward": 0.5582031160593033, "reward_std": 0.025998273864388466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.58203125, "rewards/tag_count_reward/std": 0.25998280197381973, "step": 441 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 819.90625, "completions/mean_terminated_length": 727.7122344970703, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.14733333333333334, "grad_norm": 3.4805145263671875, "kl": 5.953125, "learning_rate": 1.9865721616069695e-05, "loss": 0.3245, "num_tokens": 22370229.0, "reward": 0.580208346247673, "reward_std": 0.13673926563933492, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02734375, "rewards/penalized_accuracy_reward/std": 0.1093750074505806, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.5546875, "rewards/tag_count_reward/std": 0.3218095973134041, "step": 442 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 959.25, "completions/max_terminated_length": 943.75, "completions/mean_length": 802.453125, "completions/mean_terminated_length": 749.4040222167969, "completions/min_length": 568.5, "completions/min_terminated_length": 568.5, "epoch": 0.14766666666666667, "grad_norm": 1.4201202392578125, "kl": 3.9248046875, "learning_rate": 1.98638145542566e-05, "loss": 0.1667, "num_tokens": 22431634.0, "reward": 0.5447916388511658, "reward_std": 0.0626852991990745, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.0727677047252655, "rewards/tag_count_reward/mean": 0.578125, "rewards/tag_count_reward/std": 0.2809402644634247, "step": 443 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 918.25, "completions/mean_length": 737.53125, 
"completions/mean_terminated_length": 705.8508605957031, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 0.148, "grad_norm": 1.6902104616165161, "kl": 3.552734375, "learning_rate": 1.9861894138268402e-05, "loss": 0.2042, "num_tokens": 22487748.0, "reward": 0.596875011920929, "reward_std": 0.12250060332007706, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02734375, "rewards/penalized_accuracy_reward/std": 0.1093750074505806, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6953125, "rewards/tag_count_reward/std": 0.1913830190896988, "step": 444 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.75, "completions/mean_length": 830.390625, "completions/mean_terminated_length": 790.3717346191406, "completions/min_length": 603.25, "completions/min_terminated_length": 603.25, "epoch": 0.14833333333333334, "grad_norm": 1.7514704465866089, "kl": 4.2451171875, "learning_rate": 1.985996037070505e-05, "loss": 0.1826, "num_tokens": 22551117.0, "reward": 0.5587239488959312, "reward_std": 0.20655486825853586, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02734375, "rewards/penalized_accuracy_reward/std": 0.1093750074505806, "rewards/reasoning_steps_reward/mean": 0.9322916716337204, "rewards/reasoning_steps_reward/std": 0.169231366366148, "rewards/tag_count_reward/mean": 0.65234375, "rewards/tag_count_reward/std": 0.1948315743356943, "step": 445 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 958.0, "completions/max_terminated_length": 888.5, "completions/mean_length": 723.765625, "completions/mean_terminated_length": 691.06640625, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 
0.14866666666666667, "grad_norm": 0.5837820172309875, "kl": 0.314453125, "learning_rate": 1.9858013254184597e-05, "loss": 0.0079, "num_tokens": 22606382.0, "reward": 0.5648437291383743, "reward_std": 0.01902891811914742, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6484375, "rewards/tag_count_reward/std": 0.19028928130865097, "step": 446 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 942.5, "completions/mean_length": 836.140625, "completions/mean_terminated_length": 793.2091217041016, "completions/min_length": 600.75, "completions/min_terminated_length": 600.75, "epoch": 0.149, "grad_norm": 2.6721808910369873, "kl": 0.716796875, "learning_rate": 1.9856052791343153e-05, "loss": 0.0579, "num_tokens": 22670183.0, "reward": 0.5559895634651184, "reward_std": 0.054777587531134486, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.0833333320915699, "rewards/tag_count_reward/mean": 0.6640625, "rewards/tag_count_reward/std": 0.15826414339244366, "step": 447 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 965.5, "completions/max_terminated_length": 957.5, "completions/mean_length": 766.3125, "completions/mean_terminated_length": 736.1430358886719, "completions/min_length": 514.25, "completions/min_terminated_length": 514.25, "epoch": 0.14933333333333335, "grad_norm": 0.5376331210136414, "kl": 0.31591796875, "learning_rate": 1.9854078984834904e-05, "loss": -0.0108, "num_tokens": 22729643.0, "reward": 
0.6521260887384415, "reward_std": 0.28587909252382815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08532924577593803, "rewards/penalized_accuracy_reward/std": 0.27568309009075165, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.66796875, "rewards/tag_count_reward/std": 0.19684339314699173, "step": 448 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 785.25, "completions/mean_terminated_length": 747.2240753173828, "completions/min_length": 515.75, "completions/min_terminated_length": 515.75, "epoch": 0.14966666666666667, "grad_norm": 1.1380079984664917, "kl": 0.7568359375, "learning_rate": 1.985209183733209e-05, "loss": 0.0213, "num_tokens": 22789771.0, "reward": 0.6175781190395355, "reward_std": 0.16514140227809548, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0546875, "rewards/penalized_accuracy_reward/std": 0.14943470060825348, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.62890625, "rewards/tag_count_reward/std": 0.25201747938990593, "step": 449 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 796.375, "completions/mean_terminated_length": 755.7070770263672, "completions/min_length": 554.75, "completions/min_terminated_length": 554.75, "epoch": 0.15, "grad_norm": 2.242668867111206, "kl": 0.46240234375, "learning_rate": 1.985009135152503e-05, "loss": 0.0553, "num_tokens": 22853443.0, "reward": 0.5953125059604645, "reward_std": 0.12160531315021217, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.02734375, "rewards/penalized_accuracy_reward/std": 0.1093750074505806, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6796875, "rewards/tag_count_reward/std": 0.1715698577463627, "step": 450 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1019.75, "completions/max_terminated_length": 991.75, "completions/mean_length": 812.578125, "completions/mean_terminated_length": 785.7073669433594, "completions/min_length": 603.75, "completions/min_terminated_length": 603.75, "epoch": 0.15033333333333335, "grad_norm": 3.406395435333252, "kl": 2.2548828125, "learning_rate": 1.9848077530122083e-05, "loss": 0.155, "num_tokens": 22917064.0, "reward": 0.5687500089406967, "reward_std": 0.015971079003065825, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6875, "rewards/tag_count_reward/std": 0.1597108170390129, "step": 451 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 796.75, "completions/mean_length": 716.71875, "completions/mean_terminated_length": 598.7527770996094, "completions/min_length": 365.25, "completions/min_terminated_length": 365.25, "epoch": 0.15066666666666667, "grad_norm": 15.443132400512695, "kl": 14.4765625, "learning_rate": 1.9846050375849674e-05, "loss": 0.7602, "num_tokens": 22972918.0, "reward": 0.531901054084301, "reward_std": 0.07599062426015735, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9583333283662796, 
"rewards/reasoning_steps_reward/std": 0.10116107389330864, "rewards/tag_count_reward/mean": 0.52734375, "rewards/tag_count_reward/std": 0.3163977265357971, "step": 452 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 1024.0, "completions/max_terminated_length": 709.5, "completions/mean_length": 904.53125, "completions/mean_terminated_length": 589.0020980834961, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 0.151, "grad_norm": 85.37786865234375, "kl": 39.9375, "learning_rate": 1.984400989145228e-05, "loss": 1.6947, "num_tokens": 23040920.0, "reward": 0.4795573055744171, "reward_std": 0.13686126098036766, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.895833358168602, "rewards/reasoning_steps_reward/std": 0.22046180069446564, "rewards/tag_count_reward/mean": 0.25390625, "rewards/tag_count_reward/std": 0.3303435668349266, "step": 453 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 1024.0, "completions/max_terminated_length": 767.5, "completions/mean_length": 869.421875, "completions/mean_terminated_length": 611.6889953613281, "completions/min_length": 506.25, "completions/min_terminated_length": 506.25, "epoch": 0.15133333333333332, "grad_norm": 55.974674224853516, "kl": 36.25, "learning_rate": 1.984195607969242e-05, "loss": 1.5795, "num_tokens": 23105427.0, "reward": 0.4514322876930237, "reward_std": 0.16104818508028984, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.8489583730697632, "rewards/reasoning_steps_reward/std": 0.2876831628382206, "rewards/tag_count_reward/mean": 0.26953125, "rewards/tag_count_reward/std": 
0.33850353956222534, "step": 454 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 970.5, "completions/max_terminated_length": 839.25, "completions/mean_length": 828.875, "completions/mean_terminated_length": 645.25, "completions/min_length": 445.75, "completions/min_terminated_length": 445.75, "epoch": 0.15166666666666667, "grad_norm": 21.950531005859375, "kl": 15.4345703125, "learning_rate": 1.9839888943350656e-05, "loss": 0.7035, "num_tokens": 23169291.0, "reward": 0.5143229141831398, "reward_std": 0.11808320507407188, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9427083432674408, "rewards/reasoning_steps_reward/std": 0.16406626999378204, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.26722504384815693, "step": 455 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 626.75, "completions/mean_length": 802.34375, "completions/mean_terminated_length": 463.2026672363281, "completions/min_length": 539.5, "completions/min_terminated_length": 283.5, "epoch": 0.152, "grad_norm": 4.777839660644531, "kl": 4.240234375, "learning_rate": 1.983780848522559e-05, "loss": 0.2407, "num_tokens": 23232369.0, "reward": 0.5622395873069763, "reward_std": 0.1430051177740097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02734375, "rewards/penalized_accuracy_reward/std": 0.1093750074505806, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.03726779669523239, "rewards/tag_count_reward/mean": 0.453125, "rewards/tag_count_reward/std": 0.24603138118982315, "step": 456 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 967.5, "completions/mean_length": 747.3125, "completions/mean_terminated_length": 653.7940368652344, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.15233333333333332, "grad_norm": 4.815750598907471, "kl": 0.734375, "learning_rate": 1.983571470813386e-05, "loss": 0.1456, "num_tokens": 23289381.0, "reward": 0.6244791597127914, "reward_std": 0.16721098124980927, "rewards/format_reward/mean": 0.15625, "rewards/format_reward/std": 0.3529609143733978, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.0416666641831398, "rewards/tag_count_reward/mean": 0.671875, "rewards/tag_count_reward/std": 0.3039328083395958, "step": 457 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 776.75, "completions/mean_length": 774.171875, "completions/mean_terminated_length": 562.4405136108398, "completions/min_length": 318.75, "completions/min_terminated_length": 318.75, "epoch": 0.15266666666666667, "grad_norm": 6.34462308883667, "kl": 1.1328125, "learning_rate": 1.983360761491014e-05, "loss": 0.1067, "num_tokens": 23354496.0, "reward": 0.6123213768005371, "reward_std": 0.32740413025021553, "rewards/format_reward/mean": 0.15625, "rewards/format_reward/std": 0.3604728877544403, "rewards/penalized_accuracy_reward/mean": 0.05125368386507034, "rewards/penalized_accuracy_reward/std": 0.14041048288345337, "rewards/reasoning_steps_reward/mean": 0.9010416865348816, "rewards/reasoning_steps_reward/std": 0.2615289017558098, "rewards/tag_count_reward/mean": 0.48046875, "rewards/tag_count_reward/std": 0.36171500384807587, "step": 458 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.25, "completions/mean_length": 672.09375, 
"completions/mean_terminated_length": 595.3750228881836, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.153, "grad_norm": 4.3616862297058105, "kl": 0.9384765625, "learning_rate": 1.9831487208407126e-05, "loss": 0.1203, "num_tokens": 23405526.0, "reward": 0.6326136887073517, "reward_std": 0.2799901254475117, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.28694770485162735, "rewards/penalized_accuracy_reward/mean": 0.03743140399456024, "rewards/penalized_accuracy_reward/std": 0.14972561597824097, "rewards/reasoning_steps_reward/mean": 0.9583333432674408, "rewards/reasoning_steps_reward/std": 0.13070852309465408, "rewards/tag_count_reward/mean": 0.66015625, "rewards/tag_count_reward/std": 0.31298641115427017, "step": 459 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 992.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 654.8125, "completions/mean_terminated_length": 599.49853515625, "completions/min_length": 309.25, "completions/min_terminated_length": 309.25, "epoch": 0.15333333333333332, "grad_norm": 4.061674118041992, "kl": 1.5810546875, "learning_rate": 1.9829353491495545e-05, "loss": 0.1745, "num_tokens": 23459066.0, "reward": 0.7437500059604645, "reward_std": 0.255161315202713, "rewards/format_reward/mean": 0.453125, "rewards/format_reward/std": 0.5069767236709595, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.96875, "rewards/reasoning_steps_reward/std": 0.125, "rewards/tag_count_reward/mean": 0.78125, "rewards/tag_count_reward/std": 0.28599051013588905, "step": 460 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 977.25, "completions/max_terminated_length": 964.25, "completions/mean_length": 608.140625, "completions/mean_terminated_length": 590.8534393310547, "completions/min_length": 363.5, 
"completions/min_terminated_length": 363.5, "epoch": 0.15366666666666667, "grad_norm": 3.283140182495117, "kl": 1.380859375, "learning_rate": 1.9827206467064133e-05, "loss": 0.1406, "num_tokens": 23507267.0, "reward": 0.930468738079071, "reward_std": 0.15995598956942558, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3723389655351639, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1550494320690632, "step": 461 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1009.25, "completions/max_terminated_length": 921.0, "completions/mean_length": 664.59375, "completions/mean_terminated_length": 563.5781402587891, "completions/min_length": 139.75, "completions/min_terminated_length": 139.75, "epoch": 0.154, "grad_norm": 5.861054420471191, "kl": 7.4375, "learning_rate": 1.9825046138019658e-05, "loss": 0.3875, "num_tokens": 23560393.0, "reward": 0.7765624970197678, "reward_std": 0.28909046202898026, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.46039126068353653, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.90625, "rewards/reasoning_steps_reward/std": 0.28414636105298996, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.29997236654162407, "step": 462 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 970.25, "completions/max_terminated_length": 860.25, "completions/mean_length": 560.21875, "completions/mean_terminated_length": 537.9114761352539, "completions/min_length": 206.5, "completions/min_terminated_length": 206.5, "epoch": 0.15433333333333332, "grad_norm": 2.371166467666626, "kl": 2.4482421875, "learning_rate": 
1.982287250728689e-05, "loss": 0.0957, "num_tokens": 23607447.0, "reward": 0.8838541507720947, "reward_std": 0.19957701489329338, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4000816270709038, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.1041666641831398, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.26904767379164696, "step": 463 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.5, "completions/max_terminated_length": 893.5, "completions/mean_length": 615.984375, "completions/mean_terminated_length": 615.984375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.15466666666666667, "grad_norm": 1.8780430555343628, "kl": 1.3984375, "learning_rate": 1.9820685577808604e-05, "loss": 0.1123, "num_tokens": 23656662.0, "reward": 0.9496093541383743, "reward_std": 0.12988915853202343, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.31116948276758194, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1318533569574356, "step": 464 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 976.75, "completions/max_terminated_length": 786.5, "completions/mean_length": 611.21875, "completions/mean_terminated_length": 590.4094009399414, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.155, "grad_norm": 1.3947129249572754, "kl": 3.5634765625, "learning_rate": 1.9818485352545595e-05, "loss": 0.2216, "num_tokens": 23706036.0, "reward": 0.8876301944255829, "reward_std": 0.18621249124407768, 
"rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.41898179799318314, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9947916716337204, "rewards/reasoning_steps_reward/std": 0.0208333320915699, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.17755008302628994, "step": 465 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 928.25, "completions/max_terminated_length": 807.25, "completions/mean_length": 504.75, "completions/mean_terminated_length": 485.90313720703125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.15533333333333332, "grad_norm": 2.718474864959717, "kl": 3.38525390625, "learning_rate": 1.9816271834476642e-05, "loss": 0.1876, "num_tokens": 23751172.0, "reward": 0.8970052301883698, "reward_std": 0.2223619632422924, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38336414843797684, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9479166716337204, "rewards/reasoning_steps_reward/std": 0.14994098246097565, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.19615886360406876, "step": 466 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 629.109375, "completions/mean_terminated_length": 511.8250274658203, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.15566666666666668, "grad_norm": 14.512189865112305, "kl": 15.4140625, "learning_rate": 1.981404502659853e-05, "loss": 0.7657, "num_tokens": 23803083.0, "reward": 0.7504923716187477, "reward_std": 0.5557838007807732, "rewards/format_reward/mean": 0.5, "rewards/format_reward/std": 0.46566852182149887, 
"rewards/penalized_accuracy_reward/mean": 0.0932006947696209, "rewards/penalized_accuracy_reward/std": 0.3028942197561264, "rewards/reasoning_steps_reward/mean": 0.7708333432674408, "rewards/reasoning_steps_reward/std": 0.3734225407242775, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.3007545731961727, "step": 467 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 592.25, "completions/mean_length": 527.34375, "completions/mean_terminated_length": 389.0742645263672, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.156, "grad_norm": 12.772680282592773, "kl": 17.3203125, "learning_rate": 1.981180493192603e-05, "loss": 0.9476, "num_tokens": 23849377.0, "reward": 0.6187500059604645, "reward_std": 0.3620525300502777, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.500852182507515, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.765625, "rewards/reasoning_steps_reward/std": 0.37763215601444244, "rewards/tag_count_reward/mean": 0.671875, "rewards/tag_count_reward/std": 0.33192089945077896, "step": 468 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 896.25, "completions/max_terminated_length": 853.25, "completions/mean_length": 513.234375, "completions/mean_terminated_length": 487.4890365600586, "completions/min_length": 267.5, "completions/min_terminated_length": 267.5, "epoch": 0.15633333333333332, "grad_norm": 2.41903018951416, "kl": 1.72265625, "learning_rate": 1.9809551553491918e-05, "loss": 0.103, "num_tokens": 23893920.0, "reward": 0.8069010525941849, "reward_std": 0.2257729135453701, "rewards/format_reward/mean": 0.625, "rewards/format_reward/std": 0.47360680997371674, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.08538305386900902, "rewards/tag_count_reward/mean": 0.69921875, "rewards/tag_count_reward/std": 0.27607953548431396, "step": 469 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.5, "completions/max_terminated_length": 673.5, "completions/mean_length": 471.96875, "completions/mean_terminated_length": 471.96875, "completions/min_length": 249.5, "completions/min_terminated_length": 249.5, "epoch": 0.15666666666666668, "grad_norm": 1.0782326459884644, "kl": 0.48193359375, "learning_rate": 1.980728489434693e-05, "loss": 0.0287, "num_tokens": 23933182.0, "reward": 0.9272077232599258, "reward_std": 0.27451132610440254, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.4079566150903702, "rewards/penalized_accuracy_reward/mean": 0.030723346397280693, "rewards/penalized_accuracy_reward/std": 0.12289339303970337, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.16636842116713524, "step": 470 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 990.25, "completions/max_terminated_length": 668.25, "completions/mean_length": 426.734375, "completions/mean_terminated_length": 386.1576042175293, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.157, "grad_norm": 2.5335872173309326, "kl": 3.2294921875, "learning_rate": 1.9805004957559795e-05, "loss": 0.1511, "num_tokens": 23971037.0, "reward": 0.8740885406732559, "reward_std": 0.20123326405882835, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4440634250640869, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9739583432674408, "rewards/reasoning_steps_reward/std": 0.08538305386900902, 
"rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.22462255880236626, "step": 471 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 839.0, "completions/max_terminated_length": 788.5, "completions/mean_length": 539.578125, "completions/mean_terminated_length": 532.3395843505859, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.15733333333333333, "grad_norm": 4.432992935180664, "kl": 1.138671875, "learning_rate": 1.9802711746217222e-05, "loss": 0.032, "num_tokens": 24015810.0, "reward": 1.0317133069038391, "reward_std": 0.2996555743739009, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.23328252136707306, "rewards/penalized_accuracy_reward/mean": 0.07611434161663055, "rewards/penalized_accuracy_reward/std": 0.20798414945602417, "rewards/reasoning_steps_reward/mean": 0.9791666716337204, "rewards/reasoning_steps_reward/std": 0.0833333320915699, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.07779237069189548, "step": 472 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 177.25, "completions/mean_length": 931.671875, "completions/mean_terminated_length": 107.75, "completions/min_length": 313.0, "completions/min_terminated_length": 57.0, "epoch": 0.15766666666666668, "grad_norm": 55.799678802490234, "kl": 27.28125, "learning_rate": 1.980040526342388e-05, "loss": 1.1767, "num_tokens": 24085693.0, "reward": 0.2610677070915699, "reward_std": 0.2550016790628433, "rewards/format_reward/mean": 0.046875, "rewards/format_reward/std": 0.14789126068353653, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.416666679084301, "rewards/reasoning_steps_reward/std": 0.4453532323241234, "rewards/tag_count_reward/mean": 0.33984375, "rewards/tag_count_reward/std": 
0.24816539511084557, "step": 473 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.25, "completions/mean_length": 1008.015625, "completions/mean_terminated_length": 0.25, "completions/min_length": 768.25, "completions/min_terminated_length": 0.25, "epoch": 0.158, "grad_norm": 48.84685516357422, "kl": 19.625, "learning_rate": 1.9798085512302418e-05, "loss": 0.7711, "num_tokens": 24160318.0, "reward": 0.12356770969927311, "reward_std": 0.14149870537221432, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.2135416716337204, "rewards/reasoning_steps_reward/std": 0.27610698342323303, "rewards/tag_count_reward/mean": 0.16796875, "rewards/tag_count_reward/std": 0.12979382276535034, "step": 474 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 230.5, "completions/mean_length": 1022.40625, "completions/mean_terminated_length": 230.5, "completions/min_length": 998.5, "completions/min_terminated_length": 230.5, "epoch": 0.15833333333333333, "grad_norm": 25.87973976135254, "kl": 9.75, "learning_rate": 1.979575249599344e-05, "loss": 0.3893, "num_tokens": 24236520.0, "reward": 0.10117187723517418, "reward_std": 0.12787336483597755, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.1718750037252903, "rewards/reasoning_steps_reward/std": 0.24886878952383995, "rewards/tag_count_reward/mean": 0.15234375, "rewards/tag_count_reward/std": 0.1389313079416752, "step": 475 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, 
"completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.15866666666666668, "grad_norm": 9.469910621643066, "kl": 3.26953125, "learning_rate": 1.9793406217655516e-05, "loss": 0.1307, "num_tokens": 24314728.0, "reward": 0.08541666809469461, "reward_std": 0.1373102180659771, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.1458333320915699, "rewards/reasoning_steps_reward/std": 0.2653312236070633, "rewards/tag_count_reward/mean": 0.125, "rewards/tag_count_reward/std": 0.12704972177743912, "step": 476 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.159, "grad_norm": 1.5597959756851196, "kl": 0.5771484375, "learning_rate": 1.979104668046516e-05, "loss": 0.0231, "num_tokens": 24389032.0, "reward": 0.08945313096046448, "reward_std": 0.12571851909160614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.15625000558793545, "rewards/reasoning_steps_reward/std": 0.2400699369609356, "rewards/tag_count_reward/mean": 0.11328125, "rewards/tag_count_reward/std": 0.12099427729845047, "step": 477 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.15933333333333333, "grad_norm": 
0.8009725213050842, "kl": 0.168701171875, "learning_rate": 1.9788673887616852e-05, "loss": 0.0067, "num_tokens": 24464520.0, "reward": 0.13242188096046448, "reward_std": 0.12582270056009293, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.2343750186264515, "rewards/reasoning_steps_reward/std": 0.23368741944432259, "rewards/tag_count_reward/mean": 0.15234375, "rewards/tag_count_reward/std": 0.11941073834896088, "step": 478 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.15966666666666668, "grad_norm": 0.558620035648346, "kl": 0.153076171875, "learning_rate": 1.9786287842323002e-05, "loss": 0.0061, "num_tokens": 24541576.0, "reward": 0.21380207687616348, "reward_std": 0.12126775458455086, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.385416679084301, "rewards/reasoning_steps_reward/std": 0.23433792963624, "rewards/tag_count_reward/mean": 0.2109375, "rewards/tag_count_reward/std": 0.07206955552101135, "step": 479 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16, "grad_norm": 0.27308619022369385, "kl": 0.1376953125, "learning_rate": 1.978388854781397e-05, "loss": 0.0055, "num_tokens": 24616664.0, "reward": 0.3531249910593033, "reward_std": 0.1377511229366064, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.6562500149011612, "rewards/reasoning_steps_reward/std": 0.2755022719502449, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 480 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16033333333333333, "grad_norm": 0.35946178436279297, "kl": 0.1312255859375, "learning_rate": 1.9781476007338058e-05, "loss": 0.0053, "num_tokens": 24691752.0, "reward": 0.4755208194255829, "reward_std": 0.09098472259938717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9010416865348816, "rewards/reasoning_steps_reward/std": 0.18196947127580643, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 481 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16066666666666668, "grad_norm": 0.09131309390068054, "kl": 0.1270751953125, "learning_rate": 1.9779050224161494e-05, "loss": 0.0051, "num_tokens": 24767704.0, "reward": 0.5249999761581421, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 482 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.161, "grad_norm": 0.08760453760623932, "kl": 0.1317138671875, "learning_rate": 1.9776611201568434e-05, "loss": 0.0053, "num_tokens": 24847144.0, "reward": 0.5249999761581421, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 483 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16133333333333333, "grad_norm": 0.23719899356365204, "kl": 0.150146484375, "learning_rate": 1.9774158942860962e-05, "loss": 0.006, "num_tokens": 24921944.0, "reward": 0.5164062231779099, "reward_std": 0.031114285811781883, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.984375, "rewards/reasoning_steps_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.2421875, "rewards/tag_count_reward/std": 0.021347815170884132, "step": 484 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, 
"completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16166666666666665, "grad_norm": 0.13751773536205292, "kl": 0.21533203125, "learning_rate": 1.977169345135908e-05, "loss": 0.0086, "num_tokens": 24995352.0, "reward": 0.5249999761581421, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 485 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.162, "grad_norm": 0.14502756297588348, "kl": 0.246337890625, "learning_rate": 1.976921473040071e-05, "loss": 0.0099, "num_tokens": 25070408.0, "reward": 0.5249999761581421, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 486 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16233333333333333, "grad_norm": 0.2363901138305664, "kl": 0.29052734375, "learning_rate": 1.9766722783341682e-05, "loss": 0.0116, "num_tokens": 25143960.0, "reward": 
0.5249999761581421, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 487 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16266666666666665, "grad_norm": 1.5923470258712769, "kl": 0.35986328125, "learning_rate": 1.976421761355572e-05, "loss": 0.0144, "num_tokens": 25218552.0, "reward": 0.5253905951976776, "reward_std": 0.0015625039814040065, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25390625, "rewards/tag_count_reward/std": 0.015625, "step": 488 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.163, "grad_norm": 0.37030932307243347, "kl": 0.3203125, "learning_rate": 1.9761699224434476e-05, "loss": 0.0128, "num_tokens": 25295160.0, "reward": 0.524218738079071, "reward_std": 0.0031250000465661287, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.2421875, "rewards/tag_count_reward/std": 0.03125, "step": 489 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 1024.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 827.359375, "completions/mean_terminated_length": 470.5062484741211, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.16333333333333333, "grad_norm": 0.975154459476471, "kl": 0.38525390625, "learning_rate": 1.9759167619387474e-05, "loss": 0.2072, "num_tokens": 25360095.0, "reward": 0.6637881994247437, "reward_std": 0.2572151683270931, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.43655145168304443, "rewards/penalized_accuracy_reward/mean": 0.024725718423724174, "rewards/penalized_accuracy_reward/std": 0.09890288859605789, "rewards/reasoning_steps_reward/mean": 1.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.390625, "rewards/tag_count_reward/std": 0.23340947180986404, "step": 490 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1024.0, "completions/max_terminated_length": 414.5, "completions/mean_length": 327.515625, "completions/mean_terminated_length": 255.17144012451172, "completions/min_length": 142.5, "completions/min_terminated_length": 142.5, "epoch": 0.16366666666666665, "grad_norm": 2.5863473415374756, "kl": 0.46240234375, "learning_rate": 1.9756622801842144e-05, "loss": 0.6562, "num_tokens": 25391056.0, "reward": 0.9365885406732559, "reward_std": 0.14823532104492188, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29578252136707306, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9895833432674408, "rewards/reasoning_steps_reward/std": 0.028463751077651978, "rewards/tag_count_reward/mean": 0.79296875, "rewards/tag_count_reward/std": 0.37085768580436707, 
"step": 491 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 427.5, "completions/mean_length": 460.296875, "completions/mean_terminated_length": 182.46656799316406, "completions/min_length": 59.25, "completions/min_terminated_length": 59.25, "epoch": 0.164, "grad_norm": 2.3596835136413574, "kl": 0.416015625, "learning_rate": 1.9754064775243797e-05, "loss": 0.6765, "num_tokens": 25431619.0, "reward": 0.7888020873069763, "reward_std": 0.22947583347558975, "rewards/format_reward/mean": 0.609375, "rewards/format_reward/std": 0.4819520115852356, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.9635416865348816, "rewards/reasoning_steps_reward/std": 0.11545588448643684, "rewards/tag_count_reward/mean": 0.6328125, "rewards/tag_count_reward/std": 0.3889954835176468, "step": 492 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 354.59375, "completions/mean_terminated_length": 70.25404167175293, "completions/min_length": 22.25, "completions/min_terminated_length": 22.25, "epoch": 0.16433333333333333, "grad_norm": 6.96124267578125, "kl": 0.9287109375, "learning_rate": 1.9751493543055634e-05, "loss": 0.326, "num_tokens": 25464729.0, "reward": 0.5072916820645332, "reward_std": 0.27873288094997406, "rewards/format_reward/mean": 0.609375, "rewards/format_reward/std": 0.48605145514011383, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.3802083432674408, "rewards/reasoning_steps_reward/std": 0.4222742021083832, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.30691851302981377, "step": 493 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16466666666666666, "grad_norm": 6.878725051879883, "kl": 4.8125, "learning_rate": 1.9748909108758727e-05, "loss": 0.1925, "num_tokens": 25541753.0, "reward": 0.17213542014360428, "reward_std": 0.18678832054138184, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.3177083507180214, "rewards/reasoning_steps_reward/std": 0.3730955421924591, "rewards/tag_count_reward/mean": 0.1328125, "rewards/tag_count_reward/std": 0.11971627920866013, "step": 494 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.165, "grad_norm": 2.3972485065460205, "kl": 1.1015625, "learning_rate": 1.9746311475852028e-05, "loss": 0.0441, "num_tokens": 25618601.0, "reward": 0.18919270858168602, "reward_std": 0.2180721014738083, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.3697916865348816, "rewards/reasoning_steps_reward/std": 0.4412023276090622, "rewards/tag_count_reward/mean": 0.04296875, "rewards/tag_count_reward/std": 0.08957063034176826, "step": 495 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.16533333333333333, "grad_norm": 0.005385370459407568, "kl": 0.049560546875, "learning_rate": 1.9743700647852356e-05, "loss": 0.002, "num_tokens": 25693913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 496 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16566666666666666, "grad_norm": 0.0007258623954840004, "kl": 0.04669189453125, "learning_rate": 1.9741076628294387e-05, "loss": 0.0019, "num_tokens": 25769449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 497 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.166, "grad_norm": 4.1728949327080045e-06, "kl": 0.044189453125, "learning_rate": 1.9738439420730674e-05, "loss": 0.0018, "num_tokens": 25847465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 
0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 498 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16633333333333333, "grad_norm": 3.0347960091603454e-06, "kl": 0.04339599609375, "learning_rate": 1.9735789028731603e-05, "loss": 0.0017, "num_tokens": 25925305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 499 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16666666666666666, "grad_norm": 2.655534899531631e-06, "kl": 0.04791259765625, "learning_rate": 1.973312545588543e-05, "loss": 0.0019, "num_tokens": 25999129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 500 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, 
"completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.167, "grad_norm": 3.536061512932065e-06, "kl": 0.046630859375, "learning_rate": 1.973044870579824e-05, "loss": 0.0019, "num_tokens": 26074825.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 501 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16733333333333333, "grad_norm": 2.505825477783219e-06, "kl": 0.0421142578125, "learning_rate": 1.972775878209397e-05, "loss": 0.0017, "num_tokens": 26151913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 502 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16766666666666666, "grad_norm": 6.029501037119189e-06, "kl": 0.0399169921875, "learning_rate": 1.9725055688414378e-05, "loss": 0.0016, "num_tokens": 26234457.0, "reward": 0.0, "reward_std": 
0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 503 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.168, "grad_norm": 4.605010872182902e-06, "kl": 0.04815673828125, "learning_rate": 1.972233942841907e-05, "loss": 0.0019, "num_tokens": 26310425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 504 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16833333333333333, "grad_norm": 9.006074833450839e-06, "kl": 0.0472412109375, "learning_rate": 1.9719610005785466e-05, "loss": 0.0019, "num_tokens": 26388953.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 505 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16866666666666666, "grad_norm": 0.0003149510594084859, "kl": 0.04443359375, "learning_rate": 1.9716867424208805e-05, "loss": 0.0018, "num_tokens": 26470969.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 506 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.169, "grad_norm": 5.774768396804575e-06, "kl": 0.04522705078125, "learning_rate": 1.9714111687402146e-05, "loss": 0.0018, "num_tokens": 26547817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 507 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16933333333333334, "grad_norm": 5.529911049961811e-06, "kl": 
0.04559326171875, "learning_rate": 1.971134279909636e-05, "loss": 0.0018, "num_tokens": 26623753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 508 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16966666666666666, "grad_norm": 4.581507710099686e-06, "kl": 0.04547119140625, "learning_rate": 1.970856076304012e-05, "loss": 0.0018, "num_tokens": 26700681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 509 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17, "grad_norm": 5.098050223750761e-06, "kl": 0.04620361328125, "learning_rate": 1.97057655829999e-05, "loss": 0.0018, "num_tokens": 26775929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 510 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17033333333333334, "grad_norm": 5.974345185677521e-06, "kl": 0.04779052734375, "learning_rate": 1.9702957262759964e-05, "loss": 0.0019, "num_tokens": 26850905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 511 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17066666666666666, "grad_norm": 0.0003325820725876838, "kl": 0.04296875, "learning_rate": 1.9700135806122378e-05, "loss": 0.0017, "num_tokens": 26930553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 512 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.171, "grad_norm": 0.00033304333919659257, "kl": 0.04541015625, "learning_rate": 1.969730121690698e-05, "loss": 0.0018, "num_tokens": 27005097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 513 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17133333333333334, "grad_norm": 2.6074931156472303e-06, "kl": 0.04351806640625, "learning_rate": 1.9694453498951392e-05, "loss": 0.0017, "num_tokens": 27078441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 514 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17166666666666666, "grad_norm": 0.00036497588735073805, "kl": 0.04791259765625, "learning_rate": 1.969159265611101e-05, "loss": 0.0019, "num_tokens": 27155305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 515 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.172, "grad_norm": 4.051561063533882e-06, "kl": 0.04730224609375, "learning_rate": 1.9688718692259007e-05, "loss": 0.0019, "num_tokens": 27231033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 516 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17233333333333334, "grad_norm": 7.149666998884641e-06, "kl": 0.0462646484375, "learning_rate": 1.9685831611286312e-05, "loss": 0.0019, "num_tokens": 27308425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 517 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17266666666666666, "grad_norm": 2.3875866190792294e-06, "kl": 0.0418701171875, "learning_rate": 1.968293141710161e-05, "loss": 0.0017, "num_tokens": 27382729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 518 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.173, "grad_norm": 4.664221705752425e-06, "kl": 0.04730224609375, "learning_rate": 1.9680018113631347e-05, "loss": 0.0019, "num_tokens": 27457977.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 519 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17333333333333334, "grad_norm": 2.3671229882893385e-06, "kl": 0.04437255859375, "learning_rate": 1.9677091704819714e-05, "loss": 0.0018, 
"num_tokens": 27532809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 520 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17366666666666666, "grad_norm": 3.5438849863567157e-06, "kl": 0.04730224609375, "learning_rate": 1.967415219462864e-05, "loss": 0.0019, "num_tokens": 27608233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 521 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.174, "grad_norm": 0.00036429663305170834, "kl": 0.04412841796875, "learning_rate": 1.96711995870378e-05, "loss": 0.0018, "num_tokens": 27685001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 522 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17433333333333334, "grad_norm": 3.541435717124841e-06, "kl": 0.04229736328125, "learning_rate": 1.9668233886044597e-05, "loss": 0.0017, "num_tokens": 27760857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 523 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17466666666666666, "grad_norm": 3.7687332223867998e-06, "kl": 0.04852294921875, "learning_rate": 1.9665255095664155e-05, "loss": 0.0019, "num_tokens": 27839465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 524 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, 
"epoch": 0.175, "grad_norm": 3.8078387660789303e-06, "kl": 0.044921875, "learning_rate": 1.966226321992933e-05, "loss": 0.0018, "num_tokens": 27914441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 525 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17533333333333334, "grad_norm": 4.219351922074566e-06, "kl": 0.05010986328125, "learning_rate": 1.9659258262890683e-05, "loss": 0.002, "num_tokens": 27989225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 526 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17566666666666667, "grad_norm": 4.180098585493397e-06, "kl": 0.04632568359375, "learning_rate": 1.9656240228616496e-05, "loss": 0.0019, "num_tokens": 28066457.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 527 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.176, "grad_norm": 3.0759954370296327e-06, "kl": 0.04669189453125, "learning_rate": 1.9653209121192747e-05, "loss": 0.0019, "num_tokens": 28142601.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 528 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17633333333333334, "grad_norm": 1.986911911444622e-06, "kl": 0.04364013671875, "learning_rate": 1.9650164944723116e-05, "loss": 0.0017, "num_tokens": 28218105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 529 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 
0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17666666666666667, "grad_norm": 4.626205736713018e-06, "kl": 0.0460205078125, "learning_rate": 1.964710770332898e-05, "loss": 0.0018, "num_tokens": 28294137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 530 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.177, "grad_norm": 2.776182554953266e-06, "kl": 0.0447998046875, "learning_rate": 1.964403740114939e-05, "loss": 0.0018, "num_tokens": 28369001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 531 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17733333333333334, "grad_norm": 2.6137449822272174e-06, "kl": 0.0411376953125, "learning_rate": 1.96409540423411e-05, "loss": 0.0016, "num_tokens": 28448153.0, "reward": 0.0, 
"reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 532 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17766666666666667, "grad_norm": 2.546193627495086e-06, "kl": 0.04632568359375, "learning_rate": 1.9637857631078532e-05, "loss": 0.0019, "num_tokens": 28525161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 533 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.178, "grad_norm": 2.155697984562721e-06, "kl": 0.04559326171875, "learning_rate": 1.9634748171553775e-05, "loss": 0.0018, "num_tokens": 28607017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 
534 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17833333333333334, "grad_norm": 0.00035390304401516914, "kl": 0.0439453125, "learning_rate": 1.9631625667976584e-05, "loss": 0.0018, "num_tokens": 28682473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 535 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17866666666666667, "grad_norm": 2.281694378325483e-06, "kl": 0.04302978515625, "learning_rate": 1.962849012457438e-05, "loss": 0.0017, "num_tokens": 28758553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 536 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.179, "grad_norm": 
3.3165251807076856e-06, "kl": 0.04718017578125, "learning_rate": 1.9625341545592226e-05, "loss": 0.0019, "num_tokens": 28835049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 537 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17933333333333334, "grad_norm": 3.200831315552932e-06, "kl": 0.04412841796875, "learning_rate": 1.9622179935292855e-05, "loss": 0.0018, "num_tokens": 28910745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 538 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17966666666666667, "grad_norm": 3.6182398162054596e-06, "kl": 0.04736328125, "learning_rate": 1.9619005297956623e-05, "loss": 0.0019, "num_tokens": 28988441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 539 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18, "grad_norm": 2.3984318886505207e-06, "kl": 0.045654296875, "learning_rate": 1.961581763788152e-05, "loss": 0.0018, "num_tokens": 29064473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 540 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18033333333333335, "grad_norm": 2.6171785521000857e-06, "kl": 0.04656982421875, "learning_rate": 1.961261695938319e-05, "loss": 0.0019, "num_tokens": 29139913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 541 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18066666666666667, "grad_norm": 1.799889219000761e-06, "kl": 0.04571533203125, "learning_rate": 1.960940326679488e-05, "loss": 0.0018, "num_tokens": 29220009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 542 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.181, "grad_norm": 2.294459136464866e-06, "kl": 0.04736328125, "learning_rate": 1.9606176564467465e-05, "loss": 0.0019, "num_tokens": 29294457.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 543 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18133333333333335, "grad_norm": 0.00036517292028293014, "kl": 0.04388427734375, "learning_rate": 1.9602936856769432e-05, "loss": 0.0018, "num_tokens": 29369849.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 544 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18166666666666667, "grad_norm": 1.4415670648304513e-06, "kl": 0.048095703125, "learning_rate": 1.9599684148086876e-05, "loss": 0.0019, "num_tokens": 29444297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 545 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.182, "grad_norm": 1.974558017536765e-06, "kl": 0.047607421875, "learning_rate": 1.9596418442823495e-05, "loss": 0.0019, "num_tokens": 29519065.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 546 }, { "clip_ratio": 
0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18233333333333332, "grad_norm": 2.0785569176950958e-06, "kl": 0.04736328125, "learning_rate": 1.9593139745400575e-05, "loss": 0.0019, "num_tokens": 29602025.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 547 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18266666666666667, "grad_norm": 2.3286127088795183e-06, "kl": 0.04534912109375, "learning_rate": 1.9589848060257007e-05, "loss": 0.0018, "num_tokens": 29679209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 548 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.183, "grad_norm": 2.620727173052728e-06, "kl": 
0.04559326171875, "learning_rate": 1.9586543391849243e-05, "loss": 0.0018, "num_tokens": 29753833.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 549 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18333333333333332, "grad_norm": 2.065638000203762e-06, "kl": 0.0462646484375, "learning_rate": 1.9583225744651334e-05, "loss": 0.0018, "num_tokens": 29830553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 550 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18366666666666667, "grad_norm": 1.8364229390499531e-06, "kl": 0.04766845703125, "learning_rate": 1.957989512315489e-05, "loss": 0.0019, "num_tokens": 29907289.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 
0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 551 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.184, "grad_norm": 1.488981069996953e-06, "kl": 0.044677734375, "learning_rate": 1.9576551531869092e-05, "loss": 0.0018, "num_tokens": 29982521.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 552 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18433333333333332, "grad_norm": 2.238492925243918e-06, "kl": 0.04364013671875, "learning_rate": 1.9573194975320672e-05, "loss": 0.0017, "num_tokens": 30058473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 553 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18466666666666667, "grad_norm": 0.0006856236141175032, "kl": 0.0452880859375, "learning_rate": 1.956982545805393e-05, "loss": 0.0018, "num_tokens": 30136265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 554 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.185, "grad_norm": 2.4181592834793264e-06, "kl": 0.04620361328125, "learning_rate": 1.95664429846307e-05, "loss": 0.0018, "num_tokens": 30211449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 555 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18533333333333332, "grad_norm": 0.00036669173277914524, "kl": 0.0467529296875, "learning_rate": 1.9563047559630356e-05, "loss": 0.0019, "num_tokens": 30287193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 556 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18566666666666667, "grad_norm": 1.4481156540568918e-06, "kl": 0.04388427734375, "learning_rate": 1.9559639187649817e-05, "loss": 0.0018, "num_tokens": 30361401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 557 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.186, "grad_norm": 1.5893500631136703e-06, "kl": 0.0452880859375, "learning_rate": 1.9556217873303526e-05, "loss": 0.0018, "num_tokens": 30435385.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 558 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18633333333333332, "grad_norm": 1.0814345614562626e-06, "kl": 0.04583740234375, "learning_rate": 1.9552783621223437e-05, "loss": 0.0018, "num_tokens": 30508377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 559 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18666666666666668, "grad_norm": 1.5342546930696699e-06, "kl": 0.04217529296875, "learning_rate": 1.954933643605904e-05, "loss": 0.0017, "num_tokens": 30585049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 560 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.187, "grad_norm": 1.0698181540647056e-06, "kl": 0.0458984375, "learning_rate": 1.954587632247732e-05, "loss": 0.0018, 
"num_tokens": 30659465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 561 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18733333333333332, "grad_norm": 1.5908269688225118e-06, "kl": 0.04595947265625, "learning_rate": 1.954240328516277e-05, "loss": 0.0018, "num_tokens": 30736169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 562 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18766666666666668, "grad_norm": 1.179083255919977e-06, "kl": 0.04815673828125, "learning_rate": 1.9538917328817377e-05, "loss": 0.0019, "num_tokens": 30810521.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 563 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.188, "grad_norm": 1.1328260143272928e-06, "kl": 0.04742431640625, "learning_rate": 1.9535418458160625e-05, "loss": 0.0019, "num_tokens": 30884361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 564 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18833333333333332, "grad_norm": 1.593016236256517e-06, "kl": 0.04571533203125, "learning_rate": 1.9531906677929472e-05, "loss": 0.0018, "num_tokens": 30958601.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 565 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.18866666666666668, "grad_norm": 0.00036747107515111566, "kl": 0.04644775390625, "learning_rate": 1.9528381992878362e-05, "loss": 0.0019, "num_tokens": 31033145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 566 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.189, "grad_norm": 1.2656346370931715e-06, "kl": 0.04498291015625, "learning_rate": 1.9524844407779208e-05, "loss": 0.0018, "num_tokens": 31109641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 567 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18933333333333333, "grad_norm": 0.00032583263237029314, "kl": 0.04669189453125, "learning_rate": 1.9521293927421388e-05, "loss": 0.0019, "num_tokens": 31186537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 568 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18966666666666668, "grad_norm": 1.0520099067434785e-06, "kl": 0.04327392578125, "learning_rate": 1.951773055661174e-05, "loss": 0.0017, "num_tokens": 31262057.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 569 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19, "grad_norm": 8.720533628547855e-07, "kl": 0.04473876953125, "learning_rate": 1.9514154300174542e-05, "loss": 0.0018, "num_tokens": 31338409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 570 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19033333333333333, "grad_norm": 1.0684104836400365e-06, "kl": 0.042236328125, "learning_rate": 1.9510565162951538e-05, "loss": 0.0017, "num_tokens": 31412489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 571 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19066666666666668, "grad_norm": 1.5664135162296589e-06, "kl": 0.045166015625, "learning_rate": 1.9506963149801894e-05, "loss": 0.0018, "num_tokens": 31491145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 572 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.191, "grad_norm": 1.0929404652415542e-06, "kl": 0.04669189453125, "learning_rate": 1.9503348265602212e-05, "loss": 0.0019, 
"num_tokens": 31567849.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 573 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19133333333333333, "grad_norm": 9.160022500509513e-07, "kl": 0.04510498046875, "learning_rate": 1.9499720515246524e-05, "loss": 0.0018, "num_tokens": 31642729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 574 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19166666666666668, "grad_norm": 7.101962751221436e-07, "kl": 0.0440673828125, "learning_rate": 1.9496079903646282e-05, "loss": 0.0018, "num_tokens": 31715689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 575 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.192, "grad_norm": 1.3223746009316528e-06, "kl": 0.04315185546875, "learning_rate": 1.949242643573034e-05, "loss": 0.0017, "num_tokens": 31790825.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 576 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19233333333333333, "grad_norm": 1.117964075092459e-06, "kl": 0.05084228515625, "learning_rate": 1.9488760116444966e-05, "loss": 0.002, "num_tokens": 31868441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 577 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.19266666666666668, "grad_norm": 1.1153841796840425e-06, "kl": 0.044189453125, "learning_rate": 1.948508095075383e-05, "loss": 0.0018, "num_tokens": 31943769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 578 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.193, "grad_norm": 0.0003569263790268451, "kl": 0.04486083984375, "learning_rate": 1.9481388943637976e-05, "loss": 0.0018, "num_tokens": 32018985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 579 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19333333333333333, "grad_norm": 7.219896360766143e-07, "kl": 0.04736328125, "learning_rate": 1.947768410009586e-05, "loss": 0.0019, "num_tokens": 32093513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 580 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19366666666666665, "grad_norm": 1.1462284419394564e-06, "kl": 0.04608154296875, "learning_rate": 1.9473966425143292e-05, "loss": 0.0018, "num_tokens": 32168841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 581 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.194, "grad_norm": 1.1973814935117844e-06, "kl": 0.04541015625, "learning_rate": 1.947023592381348e-05, "loss": 0.0018, "num_tokens": 32245561.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 582 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19433333333333333, "grad_norm": 1.0366587730459287e-06, "kl": 0.04290771484375, "learning_rate": 1.9466492601156964e-05, "loss": 0.0017, "num_tokens": 32321769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 583 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19466666666666665, "grad_norm": 1.0237714604954817e-06, "kl": 0.0428466796875, "learning_rate": 1.9462736462241672e-05, "loss": 0.0017, "num_tokens": 32397529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 584 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.195, "grad_norm": 9.8329178399581e-07, "kl": 0.04962158203125, "learning_rate": 1.9458967512152872e-05, "loss": 0.002, 
"num_tokens": 32472489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 585 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19533333333333333, "grad_norm": 9.641221367928665e-07, "kl": 0.04791259765625, "learning_rate": 1.945518575599317e-05, "loss": 0.0019, "num_tokens": 32547929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 586 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19566666666666666, "grad_norm": 1.0616884082992328e-06, "kl": 0.04296875, "learning_rate": 1.945139119888252e-05, "loss": 0.0017, "num_tokens": 32623689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 
0.0, "rewards/tag_count_reward/std": 0.0, "step": 587 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.196, "grad_norm": 5.584269615610538e-07, "kl": 0.04595947265625, "learning_rate": 1.9447583845958198e-05, "loss": 0.0018, "num_tokens": 32697689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 588 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19633333333333333, "grad_norm": 0.00040306319715455174, "kl": 0.0452880859375, "learning_rate": 1.944376370237481e-05, "loss": 0.0018, "num_tokens": 32774393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 589 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.19666666666666666, "grad_norm": 8.158833679772215e-07, "kl": 0.0484619140625, "learning_rate": 1.9439930773304284e-05, "loss": 0.0019, "num_tokens": 32849049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 590 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.197, "grad_norm": 9.90066382655641e-07, "kl": 0.04595947265625, "learning_rate": 1.9436085063935837e-05, "loss": 0.0018, "num_tokens": 32926345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 591 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19733333333333333, "grad_norm": 5.940618166278e-07, "kl": 0.0469970703125, "learning_rate": 1.943222657947601e-05, "loss": 0.0019, "num_tokens": 33001049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 
0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 592 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19766666666666666, "grad_norm": 1.0115753639183822e-06, "kl": 0.0438232421875, "learning_rate": 1.9428355325148632e-05, "loss": 0.0018, "num_tokens": 33076185.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 593 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.198, "grad_norm": 1.0544408723944798e-06, "kl": 0.046142578125, "learning_rate": 1.9424471306194822e-05, "loss": 0.0018, "num_tokens": 33151273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 594 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19833333333333333, "grad_norm": 7.935628332234046e-07, "kl": 0.04339599609375, "learning_rate": 1.942057452787297e-05, "loss": 0.0017, "num_tokens": 33224537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 595 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19866666666666666, "grad_norm": 5.599874839390395e-07, "kl": 0.04681396484375, "learning_rate": 1.9416664995458756e-05, "loss": 0.0019, "num_tokens": 33301929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 596 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.199, "grad_norm": 7.87715464412031e-07, "kl": 0.0439453125, "learning_rate": 1.941274271424512e-05, "loss": 0.0018, "num_tokens": 33379017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 
0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 597 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19933333333333333, "grad_norm": 5.585097255789151e-07, "kl": 0.03985595703125, "learning_rate": 1.9408807689542257e-05, "loss": 0.0016, "num_tokens": 33458233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 598 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19966666666666666, "grad_norm": 7.517481890317868e-07, "kl": 0.04473876953125, "learning_rate": 1.9404859926677625e-05, "loss": 0.0018, "num_tokens": 33538089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 599 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2, "grad_norm": 7.894312830103445e-07, "kl": 0.0423583984375, "learning_rate": 1.9400899430995923e-05, "loss": 0.0017, "num_tokens": 33613417.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 600 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20033333333333334, "grad_norm": 6.630521625083929e-07, "kl": 0.04644775390625, "learning_rate": 1.9396926207859085e-05, "loss": 0.0019, "num_tokens": 33688537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 601 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20066666666666666, "grad_norm": 9.316294153904892e-07, "kl": 0.052001953125, 
"learning_rate": 1.9392940262646284e-05, "loss": 0.0021, "num_tokens": 33767225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 602 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.201, "grad_norm": 6.783401431675884e-07, "kl": 0.04730224609375, "learning_rate": 1.9388941600753902e-05, "loss": 0.0019, "num_tokens": 33843817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 603 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20133333333333334, "grad_norm": 0.0003504432097543031, "kl": 0.047607421875, "learning_rate": 1.938493022759556e-05, "loss": 0.0019, "num_tokens": 33918441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 604 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20166666666666666, "grad_norm": 5.354993959372223e-07, "kl": 0.0474853515625, "learning_rate": 1.938090614860207e-05, "loss": 0.0019, "num_tokens": 33992201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 605 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.202, "grad_norm": 5.560214049182832e-07, "kl": 0.0455322265625, "learning_rate": 1.937686936922145e-05, "loss": 0.0018, "num_tokens": 34067289.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 606 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20233333333333334, "grad_norm": 8.298476359414053e-07, "kl": 0.04718017578125, "learning_rate": 1.937281989491892e-05, "loss": 0.0019, "num_tokens": 34143113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 607 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20266666666666666, "grad_norm": 6.181532512528065e-07, "kl": 0.04736328125, "learning_rate": 1.936875773117687e-05, "loss": 0.0019, "num_tokens": 34220489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 608 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.203, "grad_norm": 6.222510364750633e-07, "kl": 0.04669189453125, "learning_rate": 1.9364682883494892e-05, "loss": 0.0019, "num_tokens": 34297113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 609 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20333333333333334, "grad_norm": 8.284929435831145e-07, "kl": 0.04608154296875, "learning_rate": 1.9360595357389735e-05, "loss": 0.0018, "num_tokens": 34375673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 610 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20366666666666666, "grad_norm": 7.932925427667215e-07, "kl": 0.045654296875, "learning_rate": 1.9356495158395317e-05, "loss": 0.0018, "num_tokens": 34451785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 611 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.204, "grad_norm": 7.791092571096669e-07, "kl": 0.04620361328125, "learning_rate": 1.9352382292062712e-05, "loss": 0.0018, "num_tokens": 34527769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 612 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20433333333333334, "grad_norm": 3.7999194546500803e-07, "kl": 0.04302978515625, "learning_rate": 1.9348256763960146e-05, "loss": 0.0017, "num_tokens": 34606473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 613 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20466666666666666, "grad_norm": 4.3525051296455786e-07, "kl": 0.0443115234375, "learning_rate": 
1.9344118579672987e-05, "loss": 0.0018, "num_tokens": 34681321.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 614 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.205, "grad_norm": 6.318659302451124e-07, "kl": 0.0447998046875, "learning_rate": 1.9339967744803735e-05, "loss": 0.0018, "num_tokens": 34757705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 615 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20533333333333334, "grad_norm": 4.347545541349973e-07, "kl": 0.04644775390625, "learning_rate": 1.9335804264972018e-05, "loss": 0.0019, "num_tokens": 34831449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 616 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20566666666666666, "grad_norm": 4.64629408725159e-07, "kl": 0.04742431640625, "learning_rate": 1.9331628145814587e-05, "loss": 0.0019, "num_tokens": 34906185.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 617 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.206, "grad_norm": 5.55207577690453e-07, "kl": 0.047119140625, "learning_rate": 1.93274393929853e-05, "loss": 0.0019, "num_tokens": 34981369.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 618 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.20633333333333334, "grad_norm": 6.003366479490069e-07, "kl": 0.04705810546875, "learning_rate": 1.9323238012155125e-05, "loss": 0.0019, "num_tokens": 35058137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 619 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20666666666666667, "grad_norm": 5.862183343197103e-07, "kl": 0.0379638671875, "learning_rate": 1.9319024009012114e-05, "loss": 0.0015, "num_tokens": 35138473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 620 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.207, "grad_norm": 6.453685159613087e-07, "kl": 0.042236328125, "learning_rate": 1.9314797389261426e-05, "loss": 0.0017, "num_tokens": 35214297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 621 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20733333333333334, "grad_norm": 6.374597205649479e-07, "kl": 0.04522705078125, "learning_rate": 1.9310558158625286e-05, "loss": 0.0018, "num_tokens": 35292393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 622 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20766666666666667, "grad_norm": 8.011625709514192e-07, "kl": 0.04840087890625, "learning_rate": 1.9306306322842994e-05, "loss": 0.0019, "num_tokens": 35369033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 623 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.208, "grad_norm": 5.33383285983291e-07, "kl": 0.04681396484375, "learning_rate": 1.930204188767093e-05, "loss": 0.0019, "num_tokens": 35443625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 624 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20833333333333334, "grad_norm": 5.202861075304099e-07, "kl": 0.0457763671875, "learning_rate": 1.9297764858882516e-05, "loss": 0.0018, "num_tokens": 35518489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 625 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20866666666666667, "grad_norm": 5.681910124621936e-07, "kl": 0.04473876953125, "learning_rate": 
1.9293475242268224e-05, "loss": 0.0018, "num_tokens": 35592793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 626 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.209, "grad_norm": 3.2985295206344745e-07, "kl": 0.04791259765625, "learning_rate": 1.9289173043635584e-05, "loss": 0.0019, "num_tokens": 35666377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 627 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20933333333333334, "grad_norm": 7.614339097017364e-07, "kl": 0.04608154296875, "learning_rate": 1.9284858268809135e-05, "loss": 0.0018, "num_tokens": 35745065.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 628 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20966666666666667, "grad_norm": 7.926134344415914e-07, "kl": 0.046630859375, "learning_rate": 1.928053092363047e-05, "loss": 0.0019, "num_tokens": 35822537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 629 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21, "grad_norm": 0.000319549348205328, "kl": 0.04766845703125, "learning_rate": 1.927619101395818e-05, "loss": 0.0019, "num_tokens": 35904793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 630 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.21033333333333334, "grad_norm": 7.020657335488067e-07, "kl": 0.046142578125, "learning_rate": 1.9271838545667876e-05, "loss": 0.0018, "num_tokens": 35981865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 631 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21066666666666667, "grad_norm": 6.815338906562829e-07, "kl": 0.0445556640625, "learning_rate": 1.9267473524652168e-05, "loss": 0.0018, "num_tokens": 36060969.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 632 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.211, "grad_norm": 3.966312931424909e-07, "kl": 0.04913330078125, "learning_rate": 1.926309595682066e-05, "loss": 0.002, "num_tokens": 36134665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 633 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21133333333333335, "grad_norm": 6.308768547569343e-07, "kl": 0.04498291015625, "learning_rate": 1.925870584809995e-05, "loss": 0.0018, "num_tokens": 36210713.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 634 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21166666666666667, "grad_norm": 5.821686386298097e-07, "kl": 0.0458984375, "learning_rate": 1.9254303204433602e-05, "loss": 0.0018, "num_tokens": 36287177.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 635 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.212, "grad_norm": 4.2193923377453757e-07, "kl": 0.04876708984375, "learning_rate": 1.924988803178216e-05, "loss": 0.002, "num_tokens": 36361977.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 636 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21233333333333335, "grad_norm": 5.425221161203808e-07, "kl": 0.0438232421875, "learning_rate": 1.9245460336123136e-05, "loss": 0.0018, "num_tokens": 36440729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 637 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21266666666666667, "grad_norm": 4.0420098912363756e-07, "kl": 0.04266357421875, "learning_rate": 
1.9241020123450972e-05, "loss": 0.0017, "num_tokens": 36516649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 638 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.213, "grad_norm": 8.250989367297734e-07, "kl": 0.04742431640625, "learning_rate": 1.9236567399777086e-05, "loss": 0.0019, "num_tokens": 36594681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 639 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21333333333333335, "grad_norm": 5.732405838898558e-07, "kl": 0.04296875, "learning_rate": 1.923210217112981e-05, "loss": 0.0017, "num_tokens": 36673449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 640 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21366666666666667, "grad_norm": 5.579394155574846e-07, "kl": 0.0426025390625, "learning_rate": 1.9227624443554425e-05, "loss": 0.0017, "num_tokens": 36748649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 641 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.214, "grad_norm": 4.645865772090474e-07, "kl": 0.04852294921875, "learning_rate": 1.9223134223113122e-05, "loss": 0.0019, "num_tokens": 36823129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 642 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.21433333333333332, "grad_norm": 3.995492647845822e-07, "kl": 0.0423583984375, "learning_rate": 1.9218631515885007e-05, "loss": 0.0017, "num_tokens": 36897753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 643 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21466666666666667, "grad_norm": 4.308226095872669e-07, "kl": 0.0445556640625, "learning_rate": 1.9214116327966095e-05, "loss": 0.0018, "num_tokens": 36976425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 644 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.215, "grad_norm": 2.6320984147787385e-07, "kl": 0.04473876953125, "learning_rate": 1.9209588665469294e-05, "loss": 0.0018, "num_tokens": 37050441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 645 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21533333333333332, "grad_norm": 4.1645921555755194e-07, "kl": 0.04864501953125, "learning_rate": 1.9205048534524405e-05, "loss": 0.0019, "num_tokens": 37126089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 646 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21566666666666667, "grad_norm": 3.734244558017963e-07, "kl": 0.044189453125, "learning_rate": 1.9200495941278105e-05, "loss": 0.0018, "num_tokens": 37202809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 647 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.216, "grad_norm": 3.4602098253344593e-07, "kl": 0.04730224609375, "learning_rate": 1.9195930891893946e-05, "loss": 0.0019, "num_tokens": 37276393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 648 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21633333333333332, "grad_norm": 3.98334606188655e-07, "kl": 0.0435791015625, "learning_rate": 1.9191353392552346e-05, "loss": 0.0017, "num_tokens": 37353353.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 649 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21666666666666667, "grad_norm": 3.844149318865675e-07, "kl": 0.0479736328125, "learning_rate": 
1.9186763449450572e-05, "loss": 0.0019, "num_tokens": 37428585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 650 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.217, "grad_norm": 4.270441422704607e-07, "kl": 0.04534912109375, "learning_rate": 1.9182161068802742e-05, "loss": 0.0018, "num_tokens": 37506489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 651 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21733333333333332, "grad_norm": 6.118355031503597e-07, "kl": 0.04656982421875, "learning_rate": 1.9177546256839814e-05, "loss": 0.0019, "num_tokens": 37582201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 652 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21766666666666667, "grad_norm": 4.588695503571216e-07, "kl": 0.04656982421875, "learning_rate": 1.9172919019809572e-05, "loss": 0.0019, "num_tokens": 37657225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 653 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.218, "grad_norm": 3.8411499758694845e-07, "kl": 0.04534912109375, "learning_rate": 1.9168279363976627e-05, "loss": 0.0018, "num_tokens": 37731577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 654 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.21833333333333332, "grad_norm": 4.050346831263596e-07, "kl": 0.046875, "learning_rate": 1.9163627295622397e-05, "loss": 0.0019, "num_tokens": 37811289.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 655 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21866666666666668, "grad_norm": 3.476679069081001e-07, "kl": 0.0487060546875, "learning_rate": 1.9158962821045113e-05, "loss": 0.0019, "num_tokens": 37886489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 656 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.219, "grad_norm": 3.4760830658342456e-07, "kl": 0.04766845703125, "learning_rate": 1.9154285946559792e-05, "loss": 0.0019, "num_tokens": 37960617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 657 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21933333333333332, "grad_norm": 3.754953468160238e-07, "kl": 0.0458984375, "learning_rate": 1.914959667849825e-05, "loss": 0.0018, "num_tokens": 38036425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 658 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21966666666666668, "grad_norm": 3.688506637899991e-07, "kl": 0.04632568359375, "learning_rate": 1.9144895023209072e-05, "loss": 0.0019, "num_tokens": 38110089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 659 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22, "grad_norm": 6.699099799334363e-07, "kl": 0.045654296875, "learning_rate": 1.914018098705762e-05, "loss": 0.0018, "num_tokens": 38186473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 660 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22033333333333333, "grad_norm": 7.001960966590559e-07, "kl": 0.0458984375, "learning_rate": 1.913545457642601e-05, "loss": 0.0018, "num_tokens": 38263321.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 661 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22066666666666668, "grad_norm": 4.042692864913988e-07, "kl": 0.044921875, "learning_rate": 1.9130715797713123e-05, "loss": 0.0018, "num_tokens": 
38339241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 662 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.221, "grad_norm": 6.046190605957236e-07, "kl": 0.05047607421875, "learning_rate": 1.912596465733458e-05, "loss": 0.002, "num_tokens": 38415049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": null, "rewards/penalized_accuracy_reward/std": null, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 663 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22133333333333333, "grad_norm": 4.457091051790485e-07, "kl": 0.043701171875, "learning_rate": 1.9121201161722732e-05, "loss": 0.0017, "num_tokens": 38492601.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 664 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22166666666666668, "grad_norm": 6.932128258085868e-07, "kl": 0.0479736328125, "learning_rate": 1.911642531732666e-05, "loss": 0.0019, "num_tokens": 38570665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 665 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.222, "grad_norm": 4.7143629444690305e-07, "kl": 0.048583984375, "learning_rate": 1.9111637130612172e-05, "loss": 0.0019, "num_tokens": 38645113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 666 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.22233333333333333, "grad_norm": 5.260311013444152e-07, "kl": 0.04644775390625, "learning_rate": 1.910683660806177e-05, "loss": 0.0019, "num_tokens": 38720313.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 667 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22266666666666668, "grad_norm": 4.739439418699476e-07, "kl": 0.04656982421875, "learning_rate": 1.9102023756174675e-05, "loss": 0.0019, "num_tokens": 38795657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 668 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.223, "grad_norm": 4.6509964590768504e-07, "kl": 0.044189453125, "learning_rate": 1.9097198581466785e-05, "loss": 0.0018, "num_tokens": 38871081.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 669 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22333333333333333, "grad_norm": 3.4570342677398003e-07, "kl": 0.04705810546875, "learning_rate": 1.9092361090470688e-05, "loss": 0.0019, "num_tokens": 38945561.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 670 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22366666666666668, "grad_norm": 5.26558835645119e-07, "kl": 0.0509033203125, "learning_rate": 1.9087511289735646e-05, "loss": 0.002, "num_tokens": 39022009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 671 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.224, "grad_norm": 4.127924171370978e-07, "kl": 0.04541015625, "learning_rate": 1.9082649185827583e-05, "loss": 0.0018, "num_tokens": 39097977.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 672 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22433333333333333, "grad_norm": 4.78803087844426e-07, "kl": 0.0491943359375, "learning_rate": 1.907777478532909e-05, "loss": 0.002, "num_tokens": 39174041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 673 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22466666666666665, "grad_norm": 3.9622898384550354e-07, "kl": 0.04779052734375, "learning_rate": 1.907288809483939e-05, "loss": 0.0019, "num_tokens": 
39247513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.225, "grad_norm": 6.678451427433174e-07, "kl": 0.04833984375, "learning_rate": 1.906798912097436e-05, "loss": 0.0019, "num_tokens": 39324569.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 675 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22533333333333333, "grad_norm": 6.836395414211438e-07, "kl": 0.04388427734375, "learning_rate": 1.9063077870366504e-05, "loss": 0.0018, "num_tokens": 39402121.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 676 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22566666666666665, "grad_norm": 5.298132919051568e-07, "kl": 0.04498291015625, "learning_rate": 1.9058154349664932e-05, "loss": 0.0018, "num_tokens": 39478441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 677 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.226, "grad_norm": 5.294585889714654e-07, "kl": 0.05047607421875, "learning_rate": 1.9053218565535383e-05, "loss": 0.002, "num_tokens": 39552841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 678 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.22633333333333333, "grad_norm": 6.245261943149671e-07, "kl": 0.04461669921875, "learning_rate": 1.9048270524660197e-05, "loss": 0.0018, "num_tokens": 39629593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 679 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22666666666666666, "grad_norm": 6.321162686617754e-07, "kl": 0.0428466796875, "learning_rate": 1.90433102337383e-05, "loss": 0.0017, "num_tokens": 39706569.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 680 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.227, "grad_norm": 3.668321539862518e-07, "kl": 0.04547119140625, "learning_rate": 1.9038337699485207e-05, "loss": 0.0018, "num_tokens": 39781769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 
0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 681 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22733333333333333, "grad_norm": 5.203608566262119e-07, "kl": 0.04217529296875, "learning_rate": 1.903335292863301e-05, "loss": 0.0017, "num_tokens": 39862761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 682 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22766666666666666, "grad_norm": 3.6629140254262893e-07, "kl": 0.05181884765625, "learning_rate": 1.9028355927930363e-05, "loss": 0.0021, "num_tokens": 39937433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 683 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, 
"completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.228, "grad_norm": 3.0064828138165467e-07, "kl": 0.0489501953125, "learning_rate": 1.9023346704142488e-05, "loss": 0.002, "num_tokens": 40012009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 684 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22833333333333333, "grad_norm": 7.485186870326288e-07, "kl": 0.04705810546875, "learning_rate": 1.901832526405114e-05, "loss": 0.0019, "num_tokens": 40087401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 685 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22866666666666666, "grad_norm": 5.358536441235628e-07, "kl": 0.0478515625, "learning_rate": 1.9013291614454622e-05, "loss": 0.0019, "num_tokens": 40163225.0, "reward": 0.0, "reward_std": 
0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 686 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.229, "grad_norm": 5.794017852167599e-07, "kl": 0.04559326171875, "learning_rate": 1.9008245762167773e-05, "loss": 0.0018, "num_tokens": 40240793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 687 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22933333333333333, "grad_norm": 3.6473832665251393e-07, "kl": 0.044677734375, "learning_rate": 1.9003187714021936e-05, "loss": 0.0018, "num_tokens": 40316537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 688 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22966666666666666, "grad_norm": 7.599930427204526e-07, "kl": 0.044677734375, "learning_rate": 1.8998117476864984e-05, "loss": 0.0018, "num_tokens": 40394793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 689 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23, "grad_norm": 5.503894158209732e-07, "kl": 0.04852294921875, "learning_rate": 1.8993035057561274e-05, "loss": 0.0019, "num_tokens": 40473193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 690 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23033333333333333, "grad_norm": 4.909775270789396e-07, "kl": 
0.04400634765625, "learning_rate": 1.8987940462991673e-05, "loss": 0.0018, "num_tokens": 40546409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 691 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23066666666666666, "grad_norm": 5.05828552377352e-07, "kl": 0.0472412109375, "learning_rate": 1.8982833700053518e-05, "loss": 0.0019, "num_tokens": 40622073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 692 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.231, "grad_norm": 2.8664194928751385e-07, "kl": 0.04571533203125, "learning_rate": 1.897771477566063e-05, "loss": 0.0018, "num_tokens": 40696889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 693 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23133333333333334, "grad_norm": 4.128796149416303e-07, "kl": 0.0419921875, "learning_rate": 1.8972583696743284e-05, "loss": 0.0017, "num_tokens": 40778713.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 694 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23166666666666666, "grad_norm": 3.268671378009458e-07, "kl": 0.04388427734375, "learning_rate": 1.8967440470248227e-05, "loss": 0.0018, "num_tokens": 40853913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 695 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.232, "grad_norm": 3.5997581449009886e-07, "kl": 0.04669189453125, "learning_rate": 1.8962285103138637e-05, "loss": 0.0019, "num_tokens": 40928441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 696 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23233333333333334, "grad_norm": 3.8002178825990995e-07, "kl": 0.04620361328125, "learning_rate": 1.895711760239413e-05, "loss": 0.0019, "num_tokens": 41007049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 697 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23266666666666666, "grad_norm": 4.90494642235717e-07, "kl": 0.04376220703125, "learning_rate": 1.895193797501076e-05, "loss": 0.0018, "num_tokens": 41081209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 698 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.233, "grad_norm": 4.93644108701119e-07, "kl": 0.046875, "learning_rate": 1.8946746228000987e-05, "loss": 0.0019, "num_tokens": 41158441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 699 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23333333333333334, "grad_norm": 4.896035079582362e-07, "kl": 0.05010986328125, "learning_rate": 1.8941542368393683e-05, "loss": 0.002, "num_tokens": 41232889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 700 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23366666666666666, "grad_norm": 3.914606168109458e-07, "kl": 0.0438232421875, "learning_rate": 1.8936326403234125e-05, "loss": 0.0018, "num_tokens": 41305865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 701 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.234, "grad_norm": 6.543713766404835e-07, "kl": 0.04534912109375, "learning_rate": 1.893109833958397e-05, "loss": 0.0018, "num_tokens": 41381497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 702 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23433333333333334, "grad_norm": 3.5247768437329796e-07, "kl": 0.0440673828125, "learning_rate": 1.892585818452126e-05, "loss": 0.0018, 
"num_tokens": 41457833.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 703 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23466666666666666, "grad_norm": 5.829222686770663e-07, "kl": 0.045654296875, "learning_rate": 1.8920605945140396e-05, "loss": 0.0018, "num_tokens": 41534841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 704 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.235, "grad_norm": 5.0437063237041e-07, "kl": 0.04449462890625, "learning_rate": 1.8915341628552166e-05, "loss": 0.0018, "num_tokens": 41612153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 705 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23533333333333334, "grad_norm": 6.390648081833206e-07, "kl": 0.04736328125, "learning_rate": 1.891006524188368e-05, "loss": 0.0019, "num_tokens": 41687449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 706 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23566666666666666, "grad_norm": 4.818857064492477e-07, "kl": 0.0482177734375, "learning_rate": 1.8904776792278403e-05, "loss": 0.0019, "num_tokens": 41763625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 707 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, 
"epoch": 0.236, "grad_norm": 4.535727669008338e-07, "kl": 0.04840087890625, "learning_rate": 1.889947628689613e-05, "loss": 0.0019, "num_tokens": 41839161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 708 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23633333333333334, "grad_norm": 5.368497113522608e-07, "kl": 0.0477294921875, "learning_rate": 1.889416373291298e-05, "loss": 0.0019, "num_tokens": 41915817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 709 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23666666666666666, "grad_norm": 2.403244536708371e-07, "kl": 0.0430908203125, "learning_rate": 1.888883913752137e-05, "loss": 0.0017, "num_tokens": 41989977.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 710 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.237, "grad_norm": 4.815542524738703e-07, "kl": 0.0452880859375, "learning_rate": 1.8883502507930044e-05, "loss": 0.0018, "num_tokens": 42065769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 711 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23733333333333334, "grad_norm": 4.835127924707194e-07, "kl": 0.04510498046875, "learning_rate": 1.8878153851364013e-05, "loss": 0.0018, "num_tokens": 42139449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 712 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 
0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23766666666666666, "grad_norm": 4.935627089253103e-07, "kl": 0.04754638671875, "learning_rate": 1.8872793175064594e-05, "loss": 0.0019, "num_tokens": 42215625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 713 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.238, "grad_norm": 4.2039519598802144e-07, "kl": 0.04962158203125, "learning_rate": 1.886742048628936e-05, "loss": 0.002, "num_tokens": 42291673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 714 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23833333333333334, "grad_norm": 5.261549631541129e-07, "kl": 0.0433349609375, "learning_rate": 1.8862035792312148e-05, "loss": 0.0017, "num_tokens": 42366505.0, "reward": 0.0, 
"reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 715 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23866666666666667, "grad_norm": 5.154719247002504e-07, "kl": 0.04608154296875, "learning_rate": 1.8856639100423062e-05, "loss": 0.0018, "num_tokens": 42450521.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 716 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.239, "grad_norm": 6.216779979695275e-07, "kl": 0.04974365234375, "learning_rate": 1.8851230417928433e-05, "loss": 0.002, "num_tokens": 42526553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 
717 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23933333333333334, "grad_norm": 4.082999680576904e-07, "kl": 0.04339599609375, "learning_rate": 1.884580975215084e-05, "loss": 0.0017, "num_tokens": 42601593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 718 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23966666666666667, "grad_norm": 3.3806034593908407e-07, "kl": 0.0469970703125, "learning_rate": 1.8840377110429075e-05, "loss": 0.0019, "num_tokens": 42674649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 719 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24, "grad_norm": 
1.1858687685162295e-06, "kl": 0.046630859375, "learning_rate": 1.8834932500118148e-05, "loss": 0.0019, "num_tokens": 42752361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 720 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24033333333333334, "grad_norm": 0.00032892401213757694, "kl": 0.0478515625, "learning_rate": 1.8829475928589272e-05, "loss": 0.0019, "num_tokens": 42827353.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 721 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24066666666666667, "grad_norm": 2.776667713533243e-07, "kl": 0.044921875, "learning_rate": 1.8824007403229852e-05, "loss": 0.0018, "num_tokens": 42899593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 722 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.241, "grad_norm": 3.1084536544767616e-07, "kl": 0.04315185546875, "learning_rate": 1.881852693144348e-05, "loss": 0.0017, "num_tokens": 42974265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 723 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24133333333333334, "grad_norm": 3.515064577186422e-07, "kl": 0.04388427734375, "learning_rate": 1.8813034520649923e-05, "loss": 0.0018, "num_tokens": 43048889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 724 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24166666666666667, "grad_norm": 2.918212658187258e-07, "kl": 0.04815673828125, "learning_rate": 1.880753017828511e-05, "loss": 0.0019, "num_tokens": 43121705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 725 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.242, "grad_norm": 3.23695360293641e-07, "kl": 0.04595947265625, "learning_rate": 1.880201391180111e-05, "loss": 0.0018, "num_tokens": 43198377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 726 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24233333333333335, "grad_norm": 3.559672165920347e-07, "kl": 0.04974365234375, "learning_rate": 1.879648572866617e-05, "loss": 0.002, "num_tokens": 43272377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 
0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 727 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24266666666666667, "grad_norm": 3.7746602288279973e-07, "kl": 0.04852294921875, "learning_rate": 1.8790945636364628e-05, "loss": 0.0019, "num_tokens": 43347433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 728 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.243, "grad_norm": 3.523225302615174e-07, "kl": 0.04315185546875, "learning_rate": 1.8785393642396976e-05, "loss": 0.0017, "num_tokens": 43422921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 729 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24333333333333335, "grad_norm": 2.891792121317849e-07, "kl": 0.0472412109375, "learning_rate": 1.8779829754279806e-05, "loss": 0.0019, "num_tokens": 43497113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 730 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24366666666666667, "grad_norm": 5.161469402992225e-07, "kl": 0.04510498046875, "learning_rate": 1.877425397954582e-05, "loss": 0.0018, "num_tokens": 43573401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 731 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.244, "grad_norm": 4.558390855891048e-07, "kl": 0.0487060546875, 
"learning_rate": 1.876866632574381e-05, "loss": 0.0019, "num_tokens": 43649369.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 732 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24433333333333335, "grad_norm": 5.083584255771711e-07, "kl": 0.0450439453125, "learning_rate": 1.8763066800438638e-05, "loss": 0.0018, "num_tokens": 43725257.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 733 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24466666666666667, "grad_norm": 3.0682838314533e-07, "kl": 0.04461669921875, "learning_rate": 1.875745541121126e-05, "loss": 0.0018, "num_tokens": 43803545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 734 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.245, "grad_norm": 2.645230097186868e-07, "kl": 0.04833984375, "learning_rate": 1.8751832165658682e-05, "loss": 0.0019, "num_tokens": 43877401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 735 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24533333333333332, "grad_norm": 4.874401042798127e-07, "kl": 0.04931640625, "learning_rate": 1.874619707139396e-05, "loss": 0.002, "num_tokens": 43952441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 736 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24566666666666667, "grad_norm": 4.381076905701775e-07, "kl": 0.044677734375, "learning_rate": 1.8740550136046195e-05, "loss": 0.0018, "num_tokens": 44026809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 737 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.246, "grad_norm": 4.1323556843053666e-07, "kl": 0.04632568359375, "learning_rate": 1.8734891367260528e-05, "loss": 0.0019, "num_tokens": 44103081.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 738 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24633333333333332, "grad_norm": 6.676098109892337e-07, "kl": 0.0484619140625, "learning_rate": 1.8729220772698096e-05, "loss": 0.0019, "num_tokens": 44179401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 739 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24666666666666667, "grad_norm": 4.623036318207596e-07, "kl": 0.045654296875, "learning_rate": 1.872353836003608e-05, "loss": 0.0018, "num_tokens": 44254057.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 740 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.247, "grad_norm": 5.70516192510695e-07, "kl": 0.04852294921875, "learning_rate": 1.8717844136967626e-05, "loss": 0.0019, "num_tokens": 44330217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 741 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24733333333333332, "grad_norm": 3.8613060837633384e-07, "kl": 0.0433349609375, "learning_rate": 1.8712138111201898e-05, "loss": 0.0017, "num_tokens": 44405737.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 742 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24766666666666667, "grad_norm": 4.1050068944059603e-07, "kl": 0.04559326171875, "learning_rate": 1.870642029046402e-05, "loss": 0.0018, "num_tokens": 44480761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 743 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.248, "grad_norm": 5.079453444523097e-07, "kl": 0.0423583984375, "learning_rate": 1.87006906824951e-05, "loss": 0.0017, 
"num_tokens": 44558377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 744 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24833333333333332, "grad_norm": 3.411510363093839e-07, "kl": 0.0455322265625, "learning_rate": 1.869494929505219e-05, "loss": 0.0018, "num_tokens": 44631993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 745 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24866666666666667, "grad_norm": 3.024068746526609e-07, "kl": 0.04534912109375, "learning_rate": 1.8689196135908303e-05, "loss": 0.0018, "num_tokens": 44705193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 746 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.249, "grad_norm": 3.101951620010368e-07, "kl": 0.04217529296875, "learning_rate": 1.868343121285238e-05, "loss": 0.0017, "num_tokens": 44780217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 747 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24933333333333332, "grad_norm": 3.148875578062871e-07, "kl": 0.04498291015625, "learning_rate": 1.8677654533689287e-05, "loss": 0.0018, "num_tokens": 44855241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 748 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.24966666666666668, "grad_norm": 3.4707920804066816e-07, "kl": 0.04534912109375, "learning_rate": 1.8671866106239812e-05, "loss": 0.0018, "num_tokens": 44931225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 749 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25, "grad_norm": 3.905567780293495e-07, "kl": 0.0423583984375, "learning_rate": 1.866606593834065e-05, "loss": 0.0017, "num_tokens": 45008185.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 750 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25033333333333335, "grad_norm": 2.9572044013548293e-07, "kl": 0.04840087890625, "learning_rate": 1.866025403784439e-05, "loss": 0.0019, "num_tokens": 45081529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 751 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25066666666666665, "grad_norm": 2.1811952422012837e-07, "kl": 0.0435791015625, "learning_rate": 1.8654430412619494e-05, "loss": 0.0017, "num_tokens": 45159737.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 752 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.251, "grad_norm": 4.2060253235831624e-07, "kl": 0.04541015625, "learning_rate": 1.8648595070550312e-05, "loss": 0.0018, "num_tokens": 45235785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 753 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25133333333333335, "grad_norm": 5.803672706861107e-07, "kl": 0.0474853515625, "learning_rate": 1.864274801953705e-05, "loss": 0.0019, "num_tokens": 45313529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 754 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25166666666666665, "grad_norm": 3.2705352737139037e-07, "kl": 0.0484619140625, "learning_rate": 1.8636889267495767e-05, "loss": 0.0019, "num_tokens": 45387513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 755 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.252, "grad_norm": 2.806475549732568e-07, "kl": 0.046875, "learning_rate": 1.8631018822358363e-05, "loss": 0.0019, 
"num_tokens": 45462457.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 756 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25233333333333335, "grad_norm": 3.355482647293684e-07, "kl": 0.045654296875, "learning_rate": 1.8625136692072577e-05, "loss": 0.0018, "num_tokens": 45536105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 757 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25266666666666665, "grad_norm": 3.23345375363715e-07, "kl": 0.04541015625, "learning_rate": 1.8619242884601953e-05, "loss": 0.0018, "num_tokens": 45611513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 
0.0, "rewards/tag_count_reward/std": 0.0, "step": 758 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.253, "grad_norm": 2.710647493131546e-07, "kl": 0.04583740234375, "learning_rate": 1.8613337407925854e-05, "loss": 0.0018, "num_tokens": 45685417.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 759 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25333333333333335, "grad_norm": 4.818232355319196e-07, "kl": 0.04400634765625, "learning_rate": 1.860742027003944e-05, "loss": 0.0018, "num_tokens": 45761305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 760 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.25366666666666665, "grad_norm": 4.5073670662532095e-07, "kl": 0.04522705078125, "learning_rate": 1.860149147895366e-05, "loss": 0.0018, "num_tokens": 45836361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 761 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.254, "grad_norm": 3.986781109688309e-07, "kl": 0.046142578125, "learning_rate": 1.859555104269523e-05, "loss": 0.0018, "num_tokens": 45909721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 762 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25433333333333336, "grad_norm": 2.3543447014162666e-07, "kl": 0.04534912109375, "learning_rate": 1.8589598969306646e-05, "loss": 0.0018, "num_tokens": 45983065.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 763 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25466666666666665, "grad_norm": 4.235543258346297e-07, "kl": 0.04498291015625, "learning_rate": 1.8583635266846155e-05, "loss": 0.0018, "num_tokens": 46059401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 764 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.255, "grad_norm": 5.743987685491447e-07, "kl": 0.0455322265625, "learning_rate": 1.8577659943387737e-05, "loss": 0.0018, "num_tokens": 46135865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 765 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 
0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25533333333333336, "grad_norm": 4.641596831334027e-07, "kl": 0.04595947265625, "learning_rate": 1.8571673007021124e-05, "loss": 0.0018, "num_tokens": 46211961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 766 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25566666666666665, "grad_norm": 3.290427912361338e-07, "kl": 0.0447998046875, "learning_rate": 1.8565674465851753e-05, "loss": 0.0018, "num_tokens": 46287641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 767 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.256, "grad_norm": 5.178353035262262e-07, "kl": 0.04522705078125, "learning_rate": 1.8559664328000782e-05, "loss": 0.0018, "num_tokens": 46362937.0, "reward": 0.0, 
"reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 768 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25633333333333336, "grad_norm": 4.079249151800468e-07, "kl": 0.040283203125, "learning_rate": 1.855364260160507e-05, "loss": 0.0016, "num_tokens": 46444633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 769 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25666666666666665, "grad_norm": 3.4808999771485105e-07, "kl": 0.04498291015625, "learning_rate": 1.854760929481715e-05, "loss": 0.0018, "num_tokens": 46518649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 
0.0, "step": 770 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.257, "grad_norm": 3.6083434906686307e-07, "kl": 0.0447998046875, "learning_rate": 1.854156441580526e-05, "loss": 0.0018, "num_tokens": 46593257.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 771 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25733333333333336, "grad_norm": 2.5221800115105e-07, "kl": 0.04656982421875, "learning_rate": 1.8535507972753275e-05, "loss": 0.0019, "num_tokens": 46667273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 772 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25766666666666665, "grad_norm": 
2.962192411359865e-07, "kl": 0.04864501953125, "learning_rate": 1.852943997386075e-05, "loss": 0.0019, "num_tokens": 46741417.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 773 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.258, "grad_norm": 3.311577927433973e-07, "kl": 0.0386962890625, "learning_rate": 1.8523360427342877e-05, "loss": 0.0015, "num_tokens": 46818857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 774 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25833333333333336, "grad_norm": 4.447210244507005e-07, "kl": 0.04302978515625, "learning_rate": 1.851726934143048e-05, "loss": 0.0017, "num_tokens": 46896217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 775 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25866666666666666, "grad_norm": 0.00033984804758802056, "kl": 0.04327392578125, "learning_rate": 1.8511166724369997e-05, "loss": 0.0017, "num_tokens": 46972713.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 776 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.259, "grad_norm": 3.32584590978513e-07, "kl": 0.0462646484375, "learning_rate": 1.85050525844235e-05, "loss": 0.0019, "num_tokens": 47052249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 777 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25933333333333336, "grad_norm": 3.147251845803112e-07, "kl": 0.04473876953125, "learning_rate": 1.849892692986864e-05, "loss": 0.0018, "num_tokens": 47128025.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 778 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25966666666666666, "grad_norm": 4.90878960590635e-07, "kl": 0.04510498046875, "learning_rate": 1.8492789768998668e-05, "loss": 0.0018, "num_tokens": 47205561.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 779 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.26, "grad_norm": 3.842663147679559e-07, "kl": 0.044921875, "learning_rate": 1.848664111012241e-05, "loss": 0.0018, "num_tokens": 47280457.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 780 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.26033333333333336, "grad_norm": 3.071222920425498e-07, "kl": 0.04779052734375, "learning_rate": 1.848048096156426e-05, "loss": 0.0019, "num_tokens": 47356089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 781 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.26066666666666666, "grad_norm": 3.0678120310767554e-07, "kl": 0.0452880859375, "learning_rate": 1.8474309331664165e-05, "loss": 0.0018, "num_tokens": 47431497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 782 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.261, "grad_norm": 2.8356168968457496e-07, "kl": 0.04132080078125, "learning_rate": 1.8468126228777617e-05, "loss": 0.0017, "num_tokens": 47507161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 783 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2613333333333333, "grad_norm": 3.666902728127752e-07, "kl": 0.04412841796875, "learning_rate": 1.8461931661275642e-05, "loss": 0.0018, "num_tokens": 47586905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 784 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.26166666666666666, "grad_norm": 4.454333293324453e-07, "kl": 
0.04132080078125, "learning_rate": 1.8455725637544784e-05, "loss": 0.0017, "num_tokens": 47664025.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 785 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.262, "grad_norm": 3.260820164996403e-07, "kl": 0.047607421875, "learning_rate": 1.8449508165987106e-05, "loss": 0.0019, "num_tokens": 47736921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 786 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2623333333333333, "grad_norm": 2.53294899721368e-07, "kl": 0.04412841796875, "learning_rate": 1.8443279255020153e-05, "loss": 0.0018, "num_tokens": 47809481.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 787 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.26266666666666666, "grad_norm": 4.612111297319643e-07, "kl": 0.04510498046875, "learning_rate": 1.8437038913076974e-05, "loss": 0.0018, "num_tokens": 47884473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 788 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.263, "grad_norm": 2.8864297973996145e-07, "kl": 0.048828125, "learning_rate": 1.8430787148606087e-05, "loss": 0.002, "num_tokens": 47959449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 789 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2633333333333333, "grad_norm": 2.3924124548102554e-07, "kl": 0.04827880859375, "learning_rate": 1.842452397007148e-05, "loss": 0.0019, "num_tokens": 48033017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 790 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.26366666666666666, "grad_norm": 3.323621911022201e-07, "kl": 0.0482177734375, "learning_rate": 1.8418249385952575e-05, "loss": 0.0019, "num_tokens": 48109225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 791 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.264, "grad_norm": 3.8284713355096756e-07, "kl": 0.04345703125, "learning_rate": 1.8411963404744263e-05, "loss": 0.0017, "num_tokens": 48186553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 792 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2643333333333333, "grad_norm": 2.897108402066806e-07, "kl": 0.0440673828125, "learning_rate": 1.8405666034956842e-05, "loss": 0.0018, "num_tokens": 48261433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 793 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.26466666666666666, "grad_norm": 2.4835526346578263e-07, "kl": 0.04583740234375, "learning_rate": 1.8399357285116045e-05, "loss": 0.0018, "num_tokens": 48335705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 794 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.265, "grad_norm": 3.303800610865437e-07, "kl": 0.0460205078125, "learning_rate": 1.8393037163763005e-05, "loss": 0.0018, "num_tokens": 48412825.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 795 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2653333333333333, "grad_norm": 3.7167754385336593e-07, "kl": 0.04705810546875, "learning_rate": 1.8386705679454243e-05, "loss": 0.0019, "num_tokens": 48488409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 796 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.26566666666666666, "grad_norm": 4.7247795009752735e-07, "kl": 0.04736328125, "learning_rate": 
1.8380362840761675e-05, "loss": 0.0019, "num_tokens": 48568489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 797 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.266, "grad_norm": 2.2959727630222915e-07, "kl": 0.04705810546875, "learning_rate": 1.8374008656272585e-05, "loss": 0.0019, "num_tokens": 48642249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 798 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2663333333333333, "grad_norm": 3.831698620615498e-07, "kl": 0.04339599609375, "learning_rate": 1.836764313458962e-05, "loss": 0.0017, "num_tokens": 48720409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 799 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.26666666666666666, "grad_norm": 4.5236652113089804e-07, "kl": 0.0450439453125, "learning_rate": 1.836126628433077e-05, "loss": 0.0018, "num_tokens": 48795929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 800 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.267, "grad_norm": 4.7561653104821744e-07, "kl": 0.04736328125, "learning_rate": 1.8354878114129368e-05, "loss": 0.0019, "num_tokens": 48874521.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 801 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.2673333333333333, "grad_norm": 4.804242621503363e-07, "kl": 0.045654296875, "learning_rate": 1.8348478632634067e-05, "loss": 0.0018, "num_tokens": 48953065.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 802 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.26766666666666666, "grad_norm": 3.1694065683041117e-07, "kl": 0.043212890625, "learning_rate": 1.8342067848508843e-05, "loss": 0.0017, "num_tokens": 49028649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 803 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.268, "grad_norm": 2.6863898483497906e-07, "kl": 0.04705810546875, "learning_rate": 1.8335645770432963e-05, "loss": 0.0019, "num_tokens": 49102777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 804 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2683333333333333, "grad_norm": 4.76823487360889e-07, "kl": 0.0452880859375, "learning_rate": 1.8329212407100996e-05, "loss": 0.0018, "num_tokens": 49180441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 805 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.26866666666666666, "grad_norm": 3.1277591006073635e-07, "kl": 0.04461669921875, "learning_rate": 1.832276776722278e-05, "loss": 0.0018, "num_tokens": 49255897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 806 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.269, "grad_norm": 4.953151346853701e-07, "kl": 0.0472412109375, "learning_rate": 1.831631185952342e-05, "loss": 0.0019, "num_tokens": 49332553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 807 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2693333333333333, "grad_norm": 2.5542902903907816e-07, "kl": 0.04595947265625, "learning_rate": 1.8309844692743283e-05, "loss": 0.0018, "num_tokens": 49407545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 808 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.26966666666666667, "grad_norm": 3.5005550103051064e-07, "kl": 0.04656982421875, "learning_rate": 
1.8303366275637977e-05, "loss": 0.0019, "num_tokens": 49481577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 809 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.27, "grad_norm": 3.5171862577954016e-07, "kl": 0.04779052734375, "learning_rate": 1.8296876616978337e-05, "loss": 0.0019, "num_tokens": 49556841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 810 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2703333333333333, "grad_norm": 3.392306382465904e-07, "kl": 0.04425048828125, "learning_rate": 1.8290375725550417e-05, "loss": 0.0018, "num_tokens": 49631641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 811 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.27066666666666667, "grad_norm": 2.7070387886851677e-07, "kl": 0.04412841796875, "learning_rate": 1.828386361015549e-05, "loss": 0.0018, "num_tokens": 49705785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 812 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.271, "grad_norm": 2.6789638241098146e-07, "kl": 0.048583984375, "learning_rate": 1.827734027961001e-05, "loss": 0.0019, "num_tokens": 49781337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 813 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.2713333333333333, "grad_norm": 1.6448592532469775e-07, "kl": 0.0450439453125, "learning_rate": 1.827080574274562e-05, "loss": 0.0018, "num_tokens": 49861385.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 814 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.27166666666666667, "grad_norm": 3.167813815707632e-07, "kl": 0.04876708984375, "learning_rate": 1.8264260008409138e-05, "loss": 0.0019, "num_tokens": 49935945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 815 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.272, "grad_norm": 2.637715965647658e-07, "kl": 0.04156494140625, "learning_rate": 1.8257703085462542e-05, "loss": 0.0017, "num_tokens": 50017225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 816 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2723333333333333, "grad_norm": 2.5883832677209284e-07, "kl": 0.04510498046875, "learning_rate": 1.8251134982782952e-05, "loss": 0.0018, "num_tokens": 50095513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 817 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.27266666666666667, "grad_norm": 2.8478842750701006e-07, "kl": 0.0426025390625, "learning_rate": 1.8244555709262627e-05, "loss": 0.0017, "num_tokens": 50170729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 818 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.273, "grad_norm": 4.098873489510879e-07, "kl": 0.04779052734375, "learning_rate": 1.823796527380895e-05, "loss": 0.0019, "num_tokens": 50245881.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 819 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2733333333333333, "grad_norm": 2.421719784706511e-07, "kl": 0.04608154296875, "learning_rate": 1.8231363685344422e-05, "loss": 0.0018, "num_tokens": 50320265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 820 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.27366666666666667, "grad_norm": 2.761874782208906e-07, "kl": 0.04400634765625, "learning_rate": 
1.8224750952806626e-05, "loss": 0.0018, "num_tokens": 50394809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 821 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.274, "grad_norm": 2.2788773890169978e-07, "kl": 0.04498291015625, "learning_rate": 1.8218127085148246e-05, "loss": 0.0018, "num_tokens": 50468281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 822 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2743333333333333, "grad_norm": 3.9760578829373117e-07, "kl": 0.04376220703125, "learning_rate": 1.821149209133704e-05, "loss": 0.0018, "num_tokens": 50544121.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 823 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.27466666666666667, "grad_norm": 2.4715049562473723e-07, "kl": 0.04669189453125, "learning_rate": 1.8204845980355834e-05, "loss": 0.0019, "num_tokens": 50618329.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 824 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.275, "grad_norm": 3.326194644159841e-07, "kl": 0.046142578125, "learning_rate": 1.8198188761202487e-05, "loss": 0.0018, "num_tokens": 50693897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 825 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.2753333333333333, "grad_norm": 4.2275809164493694e-07, "kl": 0.0460205078125, "learning_rate": 1.819152044288992e-05, "loss": 0.0018, "num_tokens": 50771017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 826 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.27566666666666667, "grad_norm": 4.921398044643865e-07, "kl": 0.0445556640625, "learning_rate": 1.818484103444606e-05, "loss": 0.0018, "num_tokens": 50847577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 827 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.276, "grad_norm": 5.2437275144257e-07, "kl": 0.0478515625, "learning_rate": 1.8178150544913867e-05, "loss": 0.0019, "num_tokens": 50926281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 828 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2763333333333333, "grad_norm": 4.7230884092641645e-07, "kl": 0.04473876953125, "learning_rate": 1.8171448983351284e-05, "loss": 0.0018, "num_tokens": 51000761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 829 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.27666666666666667, "grad_norm": 5.202701913731289e-07, "kl": 0.04638671875, "learning_rate": 1.8164736358831265e-05, "loss": 0.0019, "num_tokens": 51076969.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 830 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.277, "grad_norm": 2.214551102497353e-07, "kl": 0.04296875, "learning_rate": 1.8158012680441723e-05, "loss": 0.0017, "num_tokens": 51151049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 831 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2773333333333333, "grad_norm": 1.864429606257545e-07, "kl": 0.041015625, "learning_rate": 1.815127795728554e-05, "loss": 0.0016, "num_tokens": 51224953.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 832 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2776666666666667, "grad_norm": 3.1296372071665246e-07, "kl": 0.04534912109375, "learning_rate": 1.814453219848057e-05, "loss": 0.0018, "num_tokens": 
51300777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 833 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.278, "grad_norm": 2.567128376540495e-07, "kl": 0.04315185546875, "learning_rate": 1.813777541315958e-05, "loss": 0.0017, "num_tokens": 51376217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 834 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2783333333333333, "grad_norm": 2.9446914595609996e-07, "kl": 0.04632568359375, "learning_rate": 1.8131007610470278e-05, "loss": 0.0019, "num_tokens": 51452761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 835 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2786666666666667, "grad_norm": 2.92154226144703e-07, "kl": 0.0419921875, "learning_rate": 1.8124228799575295e-05, "loss": 0.0017, "num_tokens": 51527145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 836 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.279, "grad_norm": 4.3799781224151957e-07, "kl": 0.04571533203125, "learning_rate": 1.811743898965215e-05, "loss": 0.0018, "num_tokens": 51602777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 837 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.2793333333333333, "grad_norm": 4.4972955492994515e-07, "kl": 0.0440673828125, "learning_rate": 1.8110638189893267e-05, "loss": 0.0018, "num_tokens": 51677993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 838 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2796666666666667, "grad_norm": 2.8373688110150397e-07, "kl": 0.04547119140625, "learning_rate": 1.8103826409505944e-05, "loss": 0.0018, "num_tokens": 51752985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 839 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.28, "grad_norm": 3.949029121486092e-07, "kl": 0.04632568359375, "learning_rate": 1.8097003657712343e-05, "loss": 0.0019, "num_tokens": 51827497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 840 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2803333333333333, "grad_norm": 2.2465060567355977e-07, "kl": 0.04547119140625, "learning_rate": 1.8090169943749477e-05, "loss": 0.0018, "num_tokens": 51901081.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 841 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2806666666666667, "grad_norm": 3.431296988765098e-07, "kl": 0.0439453125, "learning_rate": 1.8083325276869207e-05, "loss": 0.0018, "num_tokens": 51975625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 842 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.281, "grad_norm": 2.854775971172785e-07, "kl": 0.0416259765625, "learning_rate": 1.807646966633822e-05, "loss": 0.0017, "num_tokens": 52051049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 843 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2813333333333333, "grad_norm": 1.9648739169042528e-07, "kl": 0.0433349609375, "learning_rate": 1.806960312143802e-05, "loss": 0.0017, "num_tokens": 52125241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 844 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2816666666666667, "grad_norm": 2.4585619939898606e-07, "kl": 0.04327392578125, "learning_rate": 1.8062725651464913e-05, "loss": 0.0017, 
"num_tokens": 52200409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 845 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.282, "grad_norm": 3.198763351974776e-07, "kl": 0.04315185546875, "learning_rate": 1.8055837265729996e-05, "loss": 0.0017, "num_tokens": 52274393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 846 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2823333333333333, "grad_norm": 3.0104936854513653e-07, "kl": 0.0465087890625, "learning_rate": 1.804893797355914e-05, "loss": 0.0019, "num_tokens": 52347929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 847 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2826666666666667, "grad_norm": 2.0402315215051203e-07, "kl": 0.04656982421875, "learning_rate": 1.8042027784292998e-05, "loss": 0.0019, "num_tokens": 52423001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 848 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.283, "grad_norm": 2.624065018608235e-07, "kl": 0.04534912109375, "learning_rate": 1.8035106707286957e-05, "loss": 0.0018, "num_tokens": 52496521.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 849 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.2833333333333333, "grad_norm": 2.976613302507758e-07, "kl": 0.0447998046875, "learning_rate": 1.8028174751911147e-05, "loss": 0.0018, "num_tokens": 52571577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 850 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2836666666666667, "grad_norm": 3.7486356063709536e-07, "kl": 0.047607421875, "learning_rate": 1.802123192755044e-05, "loss": 0.0019, "num_tokens": 52645721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 851 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.284, "grad_norm": 2.776278051896952e-07, "kl": 0.0455322265625, "learning_rate": 1.8014278243604407e-05, "loss": 0.0018, "num_tokens": 52719785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 
0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 852 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2843333333333333, "grad_norm": 3.085508240019408e-07, "kl": 0.04376220703125, "learning_rate": 1.8007313709487334e-05, "loss": 0.0017, "num_tokens": 52795209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 853 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2846666666666667, "grad_norm": 5.131801117386203e-07, "kl": 0.0484619140625, "learning_rate": 1.800033833462819e-05, "loss": 0.0019, "num_tokens": 52870553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 854 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 
1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.285, "grad_norm": 5.162467004993232e-07, "kl": 0.04742431640625, "learning_rate": 1.7993352128470617e-05, "loss": 0.0019, "num_tokens": 52946249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 855 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2853333333333333, "grad_norm": 3.6292138361204707e-07, "kl": 0.045166015625, "learning_rate": 1.798635510047293e-05, "loss": 0.0018, "num_tokens": 53022265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 856 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2856666666666667, "grad_norm": 2.2523936138441059e-07, "kl": 0.04351806640625, "learning_rate": 1.7979347260108088e-05, "loss": 0.0017, "num_tokens": 53095801.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 857 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.286, "grad_norm": 2.972224422137515e-07, "kl": 0.04571533203125, "learning_rate": 1.797232861686369e-05, "loss": 0.0018, "num_tokens": 53171497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 858 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.28633333333333333, "grad_norm": 3.185786567883042e-07, "kl": 0.04144287109375, "learning_rate": 1.7965299180241963e-05, "loss": 0.0017, "num_tokens": 53246665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 859 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2866666666666667, "grad_norm": 3.81515633307572e-07, "kl": 0.04229736328125, "learning_rate": 1.7958258959759747e-05, "loss": 0.0017, "num_tokens": 53321897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 860 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.287, "grad_norm": 4.322461109040887e-07, "kl": 0.04693603515625, "learning_rate": 1.795120796494848e-05, "loss": 0.0019, "num_tokens": 53398409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 861 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.28733333333333333, "grad_norm": 5.812352696921153e-07, "kl": 
0.0474853515625, "learning_rate": 1.7944146205354182e-05, "loss": 0.0019, "num_tokens": 53475929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 862 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2876666666666667, "grad_norm": 3.7971406641190697e-07, "kl": 0.04779052734375, "learning_rate": 1.793707369053746e-05, "loss": 0.0019, "num_tokens": 53552137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 863 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.288, "grad_norm": 2.2259139598190814e-07, "kl": 0.0406494140625, "learning_rate": 1.7929990430073463e-05, "loss": 0.0016, "num_tokens": 53627721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 864 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.28833333333333333, "grad_norm": 4.0882432017497194e-07, "kl": 0.047119140625, "learning_rate": 1.792289643355191e-05, "loss": 0.0019, "num_tokens": 53704137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 865 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2886666666666667, "grad_norm": 2.311708868774076e-07, "kl": 0.04638671875, "learning_rate": 1.7915791710577035e-05, "loss": 0.0019, "num_tokens": 53778377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 866 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.289, "grad_norm": 4.044439094741392e-07, "kl": 0.04559326171875, "learning_rate": 1.7908676270767608e-05, "loss": 0.0018, "num_tokens": 53856073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 867 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.28933333333333333, "grad_norm": 2.2255395037973358e-07, "kl": 0.04461669921875, "learning_rate": 1.7901550123756906e-05, "loss": 0.0018, "num_tokens": 53930841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 868 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2896666666666667, "grad_norm": 6.428173833228357e-07, "kl": 0.04754638671875, "learning_rate": 1.7894413279192693e-05, "loss": 0.0019, "num_tokens": 54009897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 869 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.29, "grad_norm": 3.9727933653921355e-07, "kl": 0.04168701171875, "learning_rate": 1.7887265746737224e-05, "loss": 0.0017, "num_tokens": 54084233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 870 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.29033333333333333, "grad_norm": 2.666711793608556e-07, "kl": 0.0462646484375, "learning_rate": 1.788010753606722e-05, "loss": 0.0018, "num_tokens": 54159289.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 871 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2906666666666667, "grad_norm": 3.67074449059146e-07, "kl": 0.04736328125, "learning_rate": 1.7872938656873864e-05, "loss": 0.0019, "num_tokens": 54236153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 872 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.291, "grad_norm": 3.2913288805502816e-07, "kl": 0.04962158203125, "learning_rate": 1.7865759118862784e-05, "loss": 0.002, "num_tokens": 54310169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 873 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.29133333333333333, "grad_norm": 3.2400063787463296e-07, "kl": 0.04327392578125, "learning_rate": 1.785856893175402e-05, "loss": 0.0017, 
"num_tokens": 54384985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 874 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2916666666666667, "grad_norm": 3.641858370428963e-07, "kl": 0.0450439453125, "learning_rate": 1.7851368105282054e-05, "loss": 0.0018, "num_tokens": 54464265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 875 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.292, "grad_norm": 4.1312321741315827e-07, "kl": 0.03985595703125, "learning_rate": 1.784415664919576e-05, "loss": 0.0016, "num_tokens": 54541881.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 876 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.29233333333333333, "grad_norm": 2.482996137587179e-07, "kl": 0.048095703125, "learning_rate": 1.78369345732584e-05, "loss": 0.0019, "num_tokens": 54616297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 877 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2926666666666667, "grad_norm": 3.0637730219496007e-07, "kl": 0.046875, "learning_rate": 1.7829701887247618e-05, "loss": 0.0019, "num_tokens": 54692985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 878 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.293, "grad_norm": 3.00032922950777e-07, "kl": 0.04827880859375, "learning_rate": 1.7822458600955432e-05, "loss": 0.0019, "num_tokens": 54768745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 879 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.29333333333333333, "grad_norm": 3.8900296317478933e-07, "kl": 0.04766845703125, "learning_rate": 1.781520472418819e-05, "loss": 0.0019, "num_tokens": 54845465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 880 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2936666666666667, "grad_norm": 4.389564765006071e-07, "kl": 0.04437255859375, "learning_rate": 1.7807940266766595e-05, "loss": 0.0018, "num_tokens": 54922169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 881 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.294, "grad_norm": 2.6403799324725696e-07, "kl": 0.0447998046875, "learning_rate": 1.780066523852567e-05, "loss": 0.0018, "num_tokens": 54995785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 882 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.29433333333333334, "grad_norm": 2.7337617325429164e-07, "kl": 0.04443359375, "learning_rate": 1.7793379649314743e-05, "loss": 0.0018, "num_tokens": 55073961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 883 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, 
"completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2946666666666667, "grad_norm": 6.265134970817599e-07, "kl": 0.04644775390625, "learning_rate": 1.7786083508997452e-05, "loss": 0.0019, "num_tokens": 55152153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 884 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.295, "grad_norm": 5.472901420944254e-07, "kl": 0.0482177734375, "learning_rate": 1.7778776827451715e-05, "loss": 0.0019, "num_tokens": 55228553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 885 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.29533333333333334, "grad_norm": 2.1152351337150321e-07, "kl": 0.04718017578125, "learning_rate": 1.777145961456971e-05, "loss": 0.0019, "num_tokens": 55302281.0, "reward": 0.0, 
"reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 886 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2956666666666667, "grad_norm": 2.692565601591923e-07, "kl": 0.0491943359375, "learning_rate": 1.7764131880257892e-05, "loss": 0.002, "num_tokens": 55377193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 887 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.296, "grad_norm": 3.8602908603024844e-07, "kl": 0.04852294921875, "learning_rate": 1.7756793634436947e-05, "loss": 0.0019, "num_tokens": 55452937.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 
888 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.29633333333333334, "grad_norm": 2.8668037543866376e-07, "kl": 0.04498291015625, "learning_rate": 1.7749444887041797e-05, "loss": 0.0018, "num_tokens": 55527161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 889 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2966666666666667, "grad_norm": 4.927192094328348e-07, "kl": 0.0438232421875, "learning_rate": 1.774208564802158e-05, "loss": 0.0018, "num_tokens": 55603497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 890 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.297, "grad_norm": 
2.734639963364316e-07, "kl": 0.04498291015625, "learning_rate": 1.7734715927339642e-05, "loss": 0.0018, "num_tokens": 55678265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 891 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.29733333333333334, "grad_norm": 3.410806357351248e-07, "kl": 0.04425048828125, "learning_rate": 1.7727335734973512e-05, "loss": 0.0018, "num_tokens": 55753465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 892 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2976666666666667, "grad_norm": 3.435504822846269e-07, "kl": 0.04461669921875, "learning_rate": 1.7719945080914902e-05, "loss": 0.0018, "num_tokens": 55829113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 893 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.298, "grad_norm": 2.851470526366029e-07, "kl": 0.045654296875, "learning_rate": 1.7712543975169687e-05, "loss": 0.0018, "num_tokens": 55903609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 894 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.29833333333333334, "grad_norm": 3.106248982476245e-07, "kl": 0.04766845703125, "learning_rate": 1.7705132427757895e-05, "loss": 0.0019, "num_tokens": 55978201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 895 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2986666666666667, "grad_norm": 4.185882573892741e-07, "kl": 0.04412841796875, "learning_rate": 1.769771044871368e-05, "loss": 0.0018, "num_tokens": 56053929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 896 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.299, "grad_norm": 4.752161260057619e-07, "kl": 0.048095703125, "learning_rate": 1.7690278048085327e-05, "loss": 0.0019, "num_tokens": 56132329.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 897 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.29933333333333334, "grad_norm": 1.4269281223278085e-07, "kl": 0.04473876953125, "learning_rate": 1.7682835235935236e-05, "loss": 0.0018, "num_tokens": 56205241.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 898 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2996666666666667, "grad_norm": 2.60033175436547e-07, "kl": 0.0484619140625, "learning_rate": 1.767538202233989e-05, "loss": 0.0019, "num_tokens": 56280233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 899 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3, "grad_norm": 4.6069982317931135e-07, "kl": 0.04437255859375, "learning_rate": 1.7667918417389857e-05, "loss": 0.0018, "num_tokens": 56357945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 900 }, { "clip_ratio": 
0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30033333333333334, "grad_norm": 5.101852025291009e-07, "kl": 0.04656982421875, "learning_rate": 1.766044443118978e-05, "loss": 0.0019, "num_tokens": 56436169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 901 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3006666666666667, "grad_norm": 3.986527872257284e-07, "kl": 0.04510498046875, "learning_rate": 1.765296007385836e-05, "loss": 0.0018, "num_tokens": 56511657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 902 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.301, "grad_norm": 4.7448915552195103e-07, "kl": 
0.0465087890625, "learning_rate": 1.7645465355528317e-05, "loss": 0.0019, "num_tokens": 56589433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 903 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30133333333333334, "grad_norm": 3.2258756732517213e-07, "kl": 0.0443115234375, "learning_rate": 1.7637960286346423e-05, "loss": 0.0018, "num_tokens": 56664777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 904 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3016666666666667, "grad_norm": 3.5260444519735756e-07, "kl": 0.0467529296875, "learning_rate": 1.763044487647345e-05, "loss": 0.0019, "num_tokens": 56739961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 905 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.302, "grad_norm": 3.680910651837621e-07, "kl": 0.0440673828125, "learning_rate": 1.7622919136084183e-05, "loss": 0.0018, "num_tokens": 56816809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 906 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30233333333333334, "grad_norm": 2.2457147963450552e-07, "kl": 0.0439453125, "learning_rate": 1.761538307536737e-05, "loss": 0.0018, "num_tokens": 56892473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 907 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30266666666666664, "grad_norm": 4.272202147603821e-07, "kl": 0.04364013671875, "learning_rate": 1.760783670452575e-05, "loss": 0.0017, "num_tokens": 56970425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 908 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.303, "grad_norm": 2.722154874845728e-07, "kl": 0.04241943359375, "learning_rate": 1.7600280033776018e-05, "loss": 0.0017, "num_tokens": 57045177.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 909 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30333333333333334, "grad_norm": 4.124611052702676e-07, "kl": 0.04315185546875, "learning_rate": 1.759271307334881e-05, "loss": 0.0017, "num_tokens": 57122601.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 910 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30366666666666664, "grad_norm": 2.497382070032472e-07, "kl": 0.0465087890625, "learning_rate": 1.7585135833488692e-05, "loss": 0.0019, "num_tokens": 57196281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 911 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.304, "grad_norm": 3.360646587680094e-07, "kl": 0.041259765625, "learning_rate": 1.7577548324454148e-05, "loss": 0.0017, "num_tokens": 57274073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 912 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30433333333333334, "grad_norm": 3.755442037345347e-07, "kl": 0.0496826171875, "learning_rate": 1.7569950556517566e-05, "loss": 0.002, "num_tokens": 57351017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 913 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30466666666666664, "grad_norm": 3.8886116726644104e-07, "kl": 0.04388427734375, "learning_rate": 1.7562342539965223e-05, "loss": 0.0018, "num_tokens": 57425753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 914 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.305, "grad_norm": 2.884500815980573e-07, "kl": 0.04595947265625, "learning_rate": 1.7554724285097272e-05, "loss": 0.0018, 
"num_tokens": 57500313.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 915 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30533333333333335, "grad_norm": 3.6437225503505033e-07, "kl": 0.04486083984375, "learning_rate": 1.7547095802227723e-05, "loss": 0.0018, "num_tokens": 57576313.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 916 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30566666666666664, "grad_norm": 4.301882370327803e-07, "kl": 0.04559326171875, "learning_rate": 1.7539457101684434e-05, "loss": 0.0018, "num_tokens": 57653193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 917 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.306, "grad_norm": 3.6725342056342924e-07, "kl": 0.04827880859375, "learning_rate": 1.7531808193809106e-05, "loss": 0.0019, "num_tokens": 57729689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 918 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30633333333333335, "grad_norm": 2.8100902227379265e-07, "kl": 0.04522705078125, "learning_rate": 1.7524149088957244e-05, "loss": 0.0018, "num_tokens": 57805353.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 919 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.30666666666666664, "grad_norm": 4.2409971001688973e-07, "kl": 0.0440673828125, "learning_rate": 1.7516479797498172e-05, "loss": 0.0018, "num_tokens": 57881577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 920 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.307, "grad_norm": 2.477618465945852e-07, "kl": 0.04925537109375, "learning_rate": 1.7508800329814993e-05, "loss": 0.002, "num_tokens": 57955225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 921 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30733333333333335, "grad_norm": 3.451582983871049e-07, "kl": 0.0452880859375, "learning_rate": 1.7501110696304598e-05, "loss": 0.0018, "num_tokens": 58031321.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 922 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30766666666666664, "grad_norm": 3.8743954178244167e-07, "kl": 0.04351806640625, "learning_rate": 1.749341090737763e-05, "loss": 0.0017, "num_tokens": 58106921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 923 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.308, "grad_norm": 3.9689820141575183e-07, "kl": 0.04718017578125, "learning_rate": 1.7485700973458494e-05, "loss": 0.0019, "num_tokens": 58182617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 924 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30833333333333335, "grad_norm": 6.127338565420359e-07, "kl": 0.0474853515625, "learning_rate": 1.747798090498532e-05, "loss": 0.0019, "num_tokens": 58261289.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 925 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30866666666666664, "grad_norm": 3.2155134022104903e-07, "kl": 0.04595947265625, "learning_rate": 1.7470250712409963e-05, "loss": 0.0018, "num_tokens": 58337369.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 926 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.309, "grad_norm": 5.713849873245636e-07, "kl": 0.04425048828125, "learning_rate": 1.746251040619798e-05, "loss": 0.0018, 
"num_tokens": 58418217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 927 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30933333333333335, "grad_norm": 3.990920447449753e-07, "kl": 0.0421142578125, "learning_rate": 1.7454759996828622e-05, "loss": 0.0017, "num_tokens": 58495241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 928 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30966666666666665, "grad_norm": 2.7729507223739347e-07, "kl": 0.04541015625, "learning_rate": 1.744699949479483e-05, "loss": 0.0018, "num_tokens": 58569657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 
0.0, "rewards/tag_count_reward/std": 0.0, "step": 929 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31, "grad_norm": 4.045870127811213e-07, "kl": 0.04571533203125, "learning_rate": 1.7439228910603184e-05, "loss": 0.0018, "num_tokens": 58645721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 930 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31033333333333335, "grad_norm": 5.207534172768646e-07, "kl": 0.04290771484375, "learning_rate": 1.7431448254773943e-05, "loss": 0.0017, "num_tokens": 58722697.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 931 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.31066666666666665, "grad_norm": 3.929250738110568e-07, "kl": 0.0489501953125, "learning_rate": 1.7423657537840978e-05, "loss": 0.002, "num_tokens": 58797369.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 932 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.311, "grad_norm": 3.662092922240845e-07, "kl": 0.0447998046875, "learning_rate": 1.7415856770351797e-05, "loss": 0.0018, "num_tokens": 58874569.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 933 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31133333333333335, "grad_norm": 3.5038416967836383e-07, "kl": 0.0496826171875, "learning_rate": 1.74080459628675e-05, "loss": 0.002, "num_tokens": 58950345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 
0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 934 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31166666666666665, "grad_norm": 3.007642135344213e-07, "kl": 0.04705810546875, "learning_rate": 1.7400225125962796e-05, "loss": 0.0019, "num_tokens": 59024313.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 935 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.312, "grad_norm": 4.5071922727402125e-07, "kl": 0.0450439453125, "learning_rate": 1.739239427022596e-05, "loss": 0.0018, "num_tokens": 59099897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 936 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31233333333333335, "grad_norm": 0.00034065076033584774, "kl": 0.045166015625, "learning_rate": 1.7384553406258842e-05, "loss": 0.0018, "num_tokens": 59180329.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 937 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31266666666666665, "grad_norm": 5.446334512271278e-07, "kl": 0.04754638671875, "learning_rate": 1.7376702544676823e-05, "loss": 0.0019, "num_tokens": 59256793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 938 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.313, "grad_norm": 3.392544840608025e-07, "kl": 0.04736328125, "learning_rate": 1.736884169610884e-05, "loss": 0.0019, "num_tokens": 59331817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 
0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 939 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31333333333333335, "grad_norm": 2.0961358870863478e-07, "kl": 0.0478515625, "learning_rate": 1.7360970871197347e-05, "loss": 0.0019, "num_tokens": 59406393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 940 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31366666666666665, "grad_norm": 3.587256571790931e-07, "kl": 0.04534912109375, "learning_rate": 1.735309008059829e-05, "loss": 0.0018, "num_tokens": 59482473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 941 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.314, "grad_norm": 3.310771319320338e-07, "kl": 0.0467529296875, "learning_rate": 1.734519933498112e-05, "loss": 0.0019, "num_tokens": 59558041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 942 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31433333333333335, "grad_norm": 3.3980271041400556e-07, "kl": 0.04425048828125, "learning_rate": 1.7337298645028764e-05, "loss": 0.0018, "num_tokens": 59634313.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 943 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31466666666666665, "grad_norm": 5.596580194833223e-07, "kl": 
0.04815673828125, "learning_rate": 1.7329388021437615e-05, "loss": 0.0019, "num_tokens": 59712441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 944 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.315, "grad_norm": 0.0003744710411410779, "kl": 0.04437255859375, "learning_rate": 1.7321467474917502e-05, "loss": 0.0018, "num_tokens": 59788585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 945 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31533333333333335, "grad_norm": 4.3050246745224285e-07, "kl": 0.0472412109375, "learning_rate": 1.7313537016191706e-05, "loss": 0.0019, "num_tokens": 59870137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 946 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31566666666666665, "grad_norm": 3.356896343120752e-07, "kl": 0.0452880859375, "learning_rate": 1.7305596655996916e-05, "loss": 0.0018, "num_tokens": 59949209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 947 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.316, "grad_norm": 1.6127195578974352e-07, "kl": 0.0400390625, "learning_rate": 1.729764640508322e-05, "loss": 0.0016, "num_tokens": 60023209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 948 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31633333333333336, "grad_norm": 2.6903279604084673e-07, "kl": 0.04583740234375, "learning_rate": 1.7289686274214116e-05, "loss": 0.0018, "num_tokens": 60097129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 949 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31666666666666665, "grad_norm": 2.99013805715731e-07, "kl": 0.04278564453125, "learning_rate": 1.7281716274166464e-05, "loss": 0.0017, "num_tokens": 60173849.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 950 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.317, "grad_norm": 2.4149562705133576e-07, "kl": 0.04486083984375, "learning_rate": 1.7273736415730488e-05, "loss": 0.0018, "num_tokens": 60248857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 951 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31733333333333336, "grad_norm": 3.660021548057557e-07, "kl": 0.04833984375, "learning_rate": 1.7265746709709762e-05, "loss": 0.0019, "num_tokens": 60325145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 952 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31766666666666665, "grad_norm": 1.8270475266035646e-07, "kl": 0.047119140625, "learning_rate": 1.7257747166921186e-05, "loss": 0.0019, "num_tokens": 60399081.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 953 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.318, "grad_norm": 1.8213741270756145e-07, "kl": 0.044921875, "learning_rate": 1.7249737798194982e-05, "loss": 0.0018, "num_tokens": 60479017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 954 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31833333333333336, "grad_norm": 1.6144012704444322e-07, "kl": 0.04534912109375, "learning_rate": 1.7241718614374678e-05, "loss": 0.0018, "num_tokens": 60554249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 955 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31866666666666665, "grad_norm": 2.262213882886499e-07, "kl": 0.0458984375, "learning_rate": 
1.723368962631708e-05, "loss": 0.0018, "num_tokens": 60627545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 956 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.319, "grad_norm": 1.1377928643696578e-07, "kl": 0.03997802734375, "learning_rate": 1.722565084489228e-05, "loss": 0.0016, "num_tokens": 60701801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 957 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31933333333333336, "grad_norm": 2.4783636831671174e-07, "kl": 0.048583984375, "learning_rate": 1.7217602280983622e-05, "loss": 0.0019, "num_tokens": 60776873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 958 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.31966666666666665, "grad_norm": 3.2239969982583716e-07, "kl": 0.04345703125, "learning_rate": 1.7209543945487696e-05, "loss": 0.0017, "num_tokens": 60855401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 959 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.32, "grad_norm": 2.5958357241506747e-07, "kl": 0.0455322265625, "learning_rate": 1.720147584931431e-05, "loss": 0.0018, "num_tokens": 60932073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 960 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.32033333333333336, "grad_norm": 2.2727870430117036e-07, "kl": 0.04559326171875, "learning_rate": 1.7193398003386514e-05, "loss": 0.0018, "num_tokens": 61007337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 961 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.32066666666666666, "grad_norm": 1.5408605236189032e-07, "kl": 0.04693603515625, "learning_rate": 1.7185310418640525e-05, "loss": 0.0019, "num_tokens": 61081529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 962 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.321, "grad_norm": 2.603428583825007e-07, "kl": 0.04425048828125, "learning_rate": 1.7177213106025768e-05, "loss": 0.0018, "num_tokens": 61156937.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 963 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.32133333333333336, "grad_norm": 2.9015737368354166e-07, "kl": 0.049072265625, "learning_rate": 1.716910607650483e-05, "loss": 0.002, "num_tokens": 61233257.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 964 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.32166666666666666, "grad_norm": 4.746328841065406e-07, "kl": 0.04620361328125, "learning_rate": 1.716098934105345e-05, "loss": 0.0018, "num_tokens": 61311097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 965 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.322, "grad_norm": 2.7876174613084004e-07, "kl": 0.046875, "learning_rate": 1.7152862910660516e-05, "loss": 0.0019, "num_tokens": 61388057.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 966 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.32233333333333336, "grad_norm": 3.3170377378155536e-07, "kl": 0.045166015625, "learning_rate": 1.7144726796328034e-05, "loss": 0.0018, "num_tokens": 61463945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 967 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.32266666666666666, "grad_norm": 2.6486006277082197e-07, "kl": 0.04180908203125, "learning_rate": 
1.7136581009071126e-05, "loss": 0.0017, "num_tokens": 61545737.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 968 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.323, "grad_norm": 2.37801714320085e-07, "kl": 0.04315185546875, "learning_rate": 1.7128425559918006e-05, "loss": 0.0017, "num_tokens": 61621945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 969 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3233333333333333, "grad_norm": 1.7786433659239265e-07, "kl": 0.0479736328125, "learning_rate": 1.712026045990997e-05, "loss": 0.0019, "num_tokens": 61697177.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 970 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.32366666666666666, "grad_norm": 1.7683585440408933e-07, "kl": 0.04315185546875, "learning_rate": 1.711208572010137e-05, "loss": 0.0017, "num_tokens": 61770985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 971 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.324, "grad_norm": 2.1341801925700565e-07, "kl": 0.0411376953125, "learning_rate": 1.710390135155964e-05, "loss": 0.0016, "num_tokens": 61846889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 972 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.3243333333333333, "grad_norm": 2.0760180063916778e-07, "kl": 0.0438232421875, "learning_rate": 1.709570736536521e-05, "loss": 0.0018, "num_tokens": 61921529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 973 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.32466666666666666, "grad_norm": 2.2944283273318433e-07, "kl": 0.04364013671875, "learning_rate": 1.708750377261156e-05, "loss": 0.0017, "num_tokens": 61998233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 974 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.325, "grad_norm": 1.726755840536498e-07, "kl": 0.04791259765625, "learning_rate": 1.7079290584405158e-05, "loss": 0.0019, "num_tokens": 62073081.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 975 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3253333333333333, "grad_norm": 3.4572346407912846e-07, "kl": 0.04595947265625, "learning_rate": 1.7071067811865477e-05, "loss": 0.0018, "num_tokens": 62151593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 976 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.32566666666666666, "grad_norm": 3.7549895637312147e-07, "kl": 0.0467529296875, "learning_rate": 1.7062835466124953e-05, "loss": 0.0019, "num_tokens": 62228265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 977 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.326, "grad_norm": 1.4343746101985744e-07, "kl": 0.0455322265625, "learning_rate": 1.7054593558328996e-05, "loss": 0.0018, "num_tokens": 62300985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 978 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3263333333333333, "grad_norm": 2.116904624926974e-07, "kl": 0.04571533203125, "learning_rate": 1.7046342099635948e-05, "loss": 0.0018, "num_tokens": 62376233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 979 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.32666666666666666, "grad_norm": 3.3913011066033505e-07, "kl": 0.04510498046875, "learning_rate": 
1.7038081101217093e-05, "loss": 0.0018, "num_tokens": 62452809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 980 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.327, "grad_norm": 1.353988920982374e-07, "kl": 0.04315185546875, "learning_rate": 1.702981057425662e-05, "loss": 0.0017, "num_tokens": 62527209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 981 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3273333333333333, "grad_norm": 2.3559587702948193e-07, "kl": 0.04425048828125, "learning_rate": 1.7021530529951627e-05, "loss": 0.0018, "num_tokens": 62602617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 982 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.32766666666666666, "grad_norm": 1.7213457681464206e-07, "kl": 0.04522705078125, "learning_rate": 1.701324097951209e-05, "loss": 0.0018, "num_tokens": 62678505.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 983 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.328, "grad_norm": 3.7311707501430647e-07, "kl": 0.04547119140625, "learning_rate": 1.7004941934160866e-05, "loss": 0.0018, "num_tokens": 62756361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 984 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.3283333333333333, "grad_norm": 3.233733139040851e-07, "kl": 0.0484619140625, "learning_rate": 1.6996633405133656e-05, "loss": 0.0019, "num_tokens": 62833705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 985 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.32866666666666666, "grad_norm": 2.3835906404201523e-07, "kl": 0.043212890625, "learning_rate": 1.6988315403679e-05, "loss": 0.0017, "num_tokens": 62912665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 986 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.329, "grad_norm": 2.4506357476639096e-07, "kl": 0.04803466796875, "learning_rate": 1.6979987941058274e-05, "loss": 0.0019, "num_tokens": 62987369.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 987 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3293333333333333, "grad_norm": 1.1797215648812198e-07, "kl": 0.0438232421875, "learning_rate": 1.697165102854565e-05, "loss": 0.0018, "num_tokens": 63061145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 988 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.32966666666666666, "grad_norm": 1.3758707950728422e-07, "kl": 0.04705810546875, "learning_rate": 1.6963304677428096e-05, "loss": 0.0019, "num_tokens": 63135881.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 989 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.33, "grad_norm": 2.8284682684898144e-07, "kl": 0.04779052734375, "learning_rate": 1.6954948899005365e-05, "loss": 0.0019, "num_tokens": 63210393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 990 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3303333333333333, "grad_norm": 2.3840239293804188e-07, "kl": 0.0447998046875, "learning_rate": 1.6946583704589973e-05, "loss": 0.0018, "num_tokens": 63284905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 991 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.33066666666666666, "grad_norm": 2.231939362218327e-07, "kl": 0.0484619140625, "learning_rate": 
1.6938209105507177e-05, "loss": 0.0019, "num_tokens": 63359929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 992 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.331, "grad_norm": 2.723217562561331e-07, "kl": 0.04864501953125, "learning_rate": 1.6929825113094972e-05, "loss": 0.0019, "num_tokens": 63433769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 993 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3313333333333333, "grad_norm": 1.314094362214746e-07, "kl": 0.04608154296875, "learning_rate": 1.692143173870407e-05, "loss": 0.0018, "num_tokens": 63509225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 994 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.33166666666666667, "grad_norm": 1.4864269815006992e-07, "kl": 0.048095703125, "learning_rate": 1.6913028993697877e-05, "loss": 0.0019, "num_tokens": 63583417.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 995 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.332, "grad_norm": 2.8631365012188326e-07, "kl": 0.039306640625, "learning_rate": 1.6904616889452497e-05, "loss": 0.0016, "num_tokens": 63660713.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 996 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.3323333333333333, "grad_norm": 2.4666348963364726e-07, "kl": 0.0465087890625, "learning_rate": 1.68961954373567e-05, "loss": 0.0019, "num_tokens": 63736633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 997 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.33266666666666667, "grad_norm": 2.489578321274166e-07, "kl": 0.0477294921875, "learning_rate": 1.688776464881191e-05, "loss": 0.0019, "num_tokens": 63813257.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 998 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.333, "grad_norm": 3.299307991255773e-07, "kl": 0.04364013671875, "learning_rate": 1.6879324535232186e-05, "loss": 0.0017, "num_tokens": 63889881.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 999 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3333333333333333, "grad_norm": 2.708214310587209e-07, "kl": 0.04730224609375, "learning_rate": 1.6870875108044233e-05, "loss": 0.0019, "num_tokens": 63968505.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1000 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.33366666666666667, "grad_norm": 2.6537884423305513e-07, "kl": 0.04595947265625, "learning_rate": 1.686241637868734e-05, "loss": 0.0018, "num_tokens": 64045689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1001 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.334, "grad_norm": 2.0357389018954564e-07, "kl": 0.0465087890625, "learning_rate": 1.68539483586134e-05, "loss": 0.0019, "num_tokens": 64121945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1002 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3343333333333333, "grad_norm": 2.830045104929013e-07, "kl": 0.04766845703125, "learning_rate": 1.684547105928689e-05, "loss": 0.0019, "num_tokens": 64197753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1003 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.33466666666666667, "grad_norm": 1.793147390571903e-07, "kl": 0.0467529296875, "learning_rate": 
1.683698449218484e-05, "loss": 0.0019, "num_tokens": 64274313.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1004 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.335, "grad_norm": 2.7238968414167175e-07, "kl": 0.0477294921875, "learning_rate": 1.6828488668796836e-05, "loss": 0.0019, "num_tokens": 64347177.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1005 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3353333333333333, "grad_norm": 2.3318362707414053e-07, "kl": 0.0455322265625, "learning_rate": 1.6819983600624986e-05, "loss": 0.0018, "num_tokens": 64424089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1006 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.33566666666666667, "grad_norm": 3.2506622460459766e-07, "kl": 0.04632568359375, "learning_rate": 1.6811469299183928e-05, "loss": 0.0019, "num_tokens": 64499353.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1007 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.336, "grad_norm": 3.011139426689624e-07, "kl": 0.049072265625, "learning_rate": 1.6802945776000782e-05, "loss": 0.002, "num_tokens": 64579081.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1008 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.3363333333333333, "grad_norm": 1.725312444023075e-07, "kl": 0.046142578125, "learning_rate": 1.6794413042615168e-05, "loss": 0.0018, "num_tokens": 64654201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1009 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.33666666666666667, "grad_norm": 2.756699473138724e-07, "kl": 0.04437255859375, "learning_rate": 1.6785871110579167e-05, "loss": 0.0018, "num_tokens": 64731449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1010 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.337, "grad_norm": 1.4254494828946918e-07, "kl": 0.04620361328125, "learning_rate": 1.6777319991457325e-05, "loss": 0.0018, "num_tokens": 64805033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1011 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3373333333333333, "grad_norm": 2.3731524834147422e-07, "kl": 0.04608154296875, "learning_rate": 1.6768759696826608e-05, "loss": 0.0018, "num_tokens": 64882665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1012 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.33766666666666667, "grad_norm": 2.6397302121949906e-07, "kl": 0.04644775390625, "learning_rate": 1.6760190238276418e-05, "loss": 0.0019, "num_tokens": 64959097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1013 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.338, "grad_norm": 2.2125695409158652e-07, "kl": 0.04595947265625, "learning_rate": 1.6751611627408567e-05, "loss": 0.0018, "num_tokens": 65035481.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1014 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3383333333333333, "grad_norm": 1.3229606565801078e-07, "kl": 0.046142578125, "learning_rate": 1.6743023875837233e-05, "loss": 0.0018, "num_tokens": 65111961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1015 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.33866666666666667, "grad_norm": 1.9578828869271092e-07, "kl": 0.04437255859375, "learning_rate": 
1.6734426995189003e-05, "loss": 0.0018, "num_tokens": 65185833.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1016 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.339, "grad_norm": 3.02549779007677e-07, "kl": 0.0491943359375, "learning_rate": 1.6725820997102804e-05, "loss": 0.002, "num_tokens": 65264809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1017 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3393333333333333, "grad_norm": 1.3841760448940477e-07, "kl": 0.0467529296875, "learning_rate": 1.6717205893229904e-05, "loss": 0.0019, "num_tokens": 65339465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1018 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3396666666666667, "grad_norm": 2.1331956645553873e-07, "kl": 0.0465087890625, "learning_rate": 1.670858169523391e-05, "loss": 0.0019, "num_tokens": 65416201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1019 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.34, "grad_norm": 2.119055579896667e-07, "kl": 0.0433349609375, "learning_rate": 1.6699948414790734e-05, "loss": 0.0017, "num_tokens": 65492841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1020 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.3403333333333333, "grad_norm": 1.1064967964102834e-07, "kl": 0.04364013671875, "learning_rate": 1.6691306063588583e-05, "loss": 0.0017, "num_tokens": 65566569.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1021 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3406666666666667, "grad_norm": 2.98130061082702e-07, "kl": 0.04742431640625, "learning_rate": 1.6682654653327953e-05, "loss": 0.0019, "num_tokens": 65641817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1022 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.341, "grad_norm": 2.5687106131044857e-07, "kl": 0.0467529296875, "learning_rate": 1.66739941957216e-05, "loss": 0.0019, "num_tokens": 65719401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1023 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3413333333333333, "grad_norm": 3.3902935570040427e-07, "kl": 0.047607421875, "learning_rate": 1.6665324702494524e-05, "loss": 0.0019, "num_tokens": 65795209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1024 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3416666666666667, "grad_norm": 1.897923596061446e-07, "kl": 0.0457763671875, "learning_rate": 1.665664618538397e-05, "loss": 0.0018, "num_tokens": 65870409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1025 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.342, "grad_norm": 2.997032311213843e-07, "kl": 0.04754638671875, "learning_rate": 1.6647958656139377e-05, "loss": 0.0019, "num_tokens": 65946377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1026 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3423333333333333, "grad_norm": 1.877434669950162e-07, "kl": 0.04217529296875, "learning_rate": 1.6639262126522417e-05, "loss": 0.0017, "num_tokens": 66020441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1027 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3426666666666667, "grad_norm": 2.0538445255624538e-07, "kl": 0.0452880859375, "learning_rate": 
1.663055660830692e-05, "loss": 0.0018, "num_tokens": 66096441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1028 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.343, "grad_norm": 2.991582164213469e-07, "kl": 0.04833984375, "learning_rate": 1.6621842113278902e-05, "loss": 0.0019, "num_tokens": 66171641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1029 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3433333333333333, "grad_norm": 1.7526781448395923e-07, "kl": 0.04840087890625, "learning_rate": 1.661311865323652e-05, "loss": 0.0019, "num_tokens": 66245401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1030 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3436666666666667, "grad_norm": 2.1655398541042814e-07, "kl": 0.043701171875, "learning_rate": 1.6604386239990077e-05, "loss": 0.0017, "num_tokens": 66319401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1031 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.344, "grad_norm": 3.229246203773073e-07, "kl": 0.0458984375, "learning_rate": 1.6595644885362e-05, "loss": 0.0018, "num_tokens": 66396361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1032 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.3443333333333333, "grad_norm": 3.012336833307927e-07, "kl": 0.0472412109375, "learning_rate": 1.6586894601186804e-05, "loss": 0.0019, "num_tokens": 66471529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1033 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3446666666666667, "grad_norm": 2.6796817564900266e-07, "kl": 0.0479736328125, "learning_rate": 1.657813539931112e-05, "loss": 0.0019, "num_tokens": 66545801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1034 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.345, "grad_norm": 2.4642193352519826e-07, "kl": 0.0455322265625, "learning_rate": 1.6569367291593627e-05, "loss": 0.0018, "num_tokens": 66622553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1035 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3453333333333333, "grad_norm": 1.6899959121019492e-07, "kl": 0.0440673828125, "learning_rate": 1.6560590289905074e-05, "loss": 0.0018, "num_tokens": 66697433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1036 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3456666666666667, "grad_norm": 1.9668938477934717e-07, "kl": 0.05108642578125, "learning_rate": 1.655180440612825e-05, "loss": 0.002, "num_tokens": 66772169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1037 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.346, "grad_norm": 1.58758268753445e-07, "kl": 0.0435791015625, "learning_rate": 1.6543009652157973e-05, "loss": 0.0017, "num_tokens": 66848745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1038 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3463333333333333, "grad_norm": 1.5893020588464424e-07, "kl": 0.046875, "learning_rate": 1.6534206039901057e-05, "loss": 0.0019, "num_tokens": 66923465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1039 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3466666666666667, "grad_norm": 2.36673670883647e-07, "kl": 0.0457763671875, "learning_rate": 1.652539358127632e-05, 
"loss": 0.0018, "num_tokens": 66997833.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1040 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.347, "grad_norm": 2.716946028158418e-07, "kl": 0.046630859375, "learning_rate": 1.6516572288214555e-05, "loss": 0.0019, "num_tokens": 67073945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1041 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3473333333333333, "grad_norm": 2.588246559298568e-07, "kl": 0.04364013671875, "learning_rate": 1.650774217265851e-05, "loss": 0.0017, "num_tokens": 67151097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1042 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3476666666666667, "grad_norm": 2.486681296431925e-07, "kl": 0.0411376953125, "learning_rate": 1.649890324656289e-05, "loss": 0.0016, "num_tokens": 67227161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1043 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.348, "grad_norm": 3.447154313107603e-07, "kl": 0.0457763671875, "learning_rate": 1.649005552189431e-05, "loss": 0.0018, "num_tokens": 67304825.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1044 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.34833333333333333, "grad_norm": 1.6775233291355107e-07, "kl": 0.044921875, "learning_rate": 1.6481199010631312e-05, "loss": 0.0018, "num_tokens": 67382937.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1045 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3486666666666667, "grad_norm": 1.9476284762731666e-07, "kl": 0.04541015625, "learning_rate": 1.6472333724764326e-05, "loss": 0.0018, "num_tokens": 67457593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1046 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.349, "grad_norm": 3.570832802779478e-07, "kl": 0.045166015625, "learning_rate": 1.6463459676295666e-05, "loss": 0.0018, "num_tokens": 67534121.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1047 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.34933333333333333, "grad_norm": 1.379593896899678e-07, "kl": 0.0467529296875, "learning_rate": 1.645457687723951e-05, "loss": 0.0019, "num_tokens": 67609129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1048 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3496666666666667, "grad_norm": 3.7264581465024094e-07, "kl": 0.046142578125, "learning_rate": 1.644568533962187e-05, "loss": 0.0018, "num_tokens": 67684617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1049 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.35, "grad_norm": 1.9051678634696145e-07, "kl": 0.04595947265625, "learning_rate": 1.643678507548061e-05, "loss": 0.0018, "num_tokens": 67759161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1050 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.35033333333333333, "grad_norm": 2.8085653980269853e-07, "kl": 0.046630859375, "learning_rate": 1.6427876096865394e-05, "loss": 0.0019, "num_tokens": 67835753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1051 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3506666666666667, "grad_norm": 2.3154112227530277e-07, "kl": 0.04364013671875, "learning_rate": 
1.6418958415837688e-05, "loss": 0.0017, "num_tokens": 67909977.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1052 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.351, "grad_norm": 2.1219780421688483e-07, "kl": 0.047607421875, "learning_rate": 1.6410032044470735e-05, "loss": 0.0019, "num_tokens": 67983993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1053 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.35133333333333333, "grad_norm": 3.1413446777150966e-07, "kl": 0.04327392578125, "learning_rate": 1.6401096994849558e-05, "loss": 0.0017, "num_tokens": 68060505.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1054 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3516666666666667, "grad_norm": 3.213079935449059e-07, "kl": 0.0457763671875, "learning_rate": 1.6392153279070905e-05, "loss": 0.0018, "num_tokens": 68137689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1055 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.352, "grad_norm": 1.5866876879044867e-07, "kl": 0.0489501953125, "learning_rate": 1.6383200909243285e-05, "loss": 0.002, "num_tokens": 68210649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1056 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.35233333333333333, "grad_norm": 1.5397878883049998e-07, "kl": 0.04254150390625, "learning_rate": 1.63742398974869e-05, "loss": 0.0017, "num_tokens": 68284217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1057 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3526666666666667, "grad_norm": 1.849784183605152e-07, "kl": 0.04437255859375, "learning_rate": 1.6365270255933663e-05, "loss": 0.0018, "num_tokens": 68359449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1058 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.353, "grad_norm": 3.638995735855133e-07, "kl": 0.0435791015625, "learning_rate": 1.635629199672717e-05, "loss": 0.0017, "num_tokens": 68436297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1059 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.35333333333333333, "grad_norm": 1.8141595603538008e-07, "kl": 0.04052734375, "learning_rate": 1.6347305132022677e-05, "loss": 0.0016, "num_tokens": 68511897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1060 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3536666666666667, "grad_norm": 1.6439460637229786e-07, "kl": 0.04345703125, "learning_rate": 1.63383096739871e-05, "loss": 0.0017, "num_tokens": 68587721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1061 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.354, "grad_norm": 2.125379552353479e-07, "kl": 0.041259765625, "learning_rate": 1.6329305634798993e-05, "loss": 0.0016, "num_tokens": 68661401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1062 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.35433333333333333, "grad_norm": 2.076277070273136e-07, "kl": 0.0462646484375, "learning_rate": 1.632029302664851e-05, "loss": 0.0019, "num_tokens": 68737641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1063 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3546666666666667, "grad_norm": 1.4812604831604403e-07, "kl": 0.0447998046875, "learning_rate": 
1.6311271861737417e-05, "loss": 0.0018, "num_tokens": 68811705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1064 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.355, "grad_norm": 1.0499272207198374e-07, "kl": 0.04449462890625, "learning_rate": 1.6302242152279068e-05, "loss": 0.0018, "num_tokens": 68884681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1065 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.35533333333333333, "grad_norm": 1.536605083174436e-07, "kl": 0.0458984375, "learning_rate": 1.6293203910498375e-05, "loss": 0.0018, "num_tokens": 68957993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1066 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3556666666666667, "grad_norm": 0.0003515103890094906, "kl": 0.0419921875, "learning_rate": 1.6284157148631814e-05, "loss": 0.0017, "num_tokens": 69035865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1067 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.356, "grad_norm": 2.0275901135846652e-07, "kl": 0.04437255859375, "learning_rate": 1.6275101878927382e-05, "loss": 0.0018, "num_tokens": 69110729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1068 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.35633333333333334, "grad_norm": 3.8990665984783845e-07, "kl": 0.0482177734375, "learning_rate": 1.6266038113644605e-05, "loss": 0.0019, "num_tokens": 69184857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1069 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3566666666666667, "grad_norm": 9.444265458569134e-08, "kl": 0.0411376953125, "learning_rate": 1.625696586505451e-05, "loss": 0.0016, "num_tokens": 69259337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1070 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.357, "grad_norm": 2.391022917436203e-07, "kl": 0.04437255859375, "learning_rate": 1.6247885145439602e-05, "loss": 0.0018, "num_tokens": 69333945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1071 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.35733333333333334, "grad_norm": 2.569869081980869e-07, "kl": 0.0452880859375, "learning_rate": 1.6238795967093865e-05, "loss": 0.0018, "num_tokens": 69410825.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1072 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3576666666666667, "grad_norm": 2.1654653892255737e-07, "kl": 0.04815673828125, "learning_rate": 1.622969834232272e-05, "loss": 0.0019, "num_tokens": 69484105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1073 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.358, "grad_norm": 2.987639504681283e-07, "kl": 0.0433349609375, "learning_rate": 1.622059228344304e-05, "loss": 0.0017, "num_tokens": 69561193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1074 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.35833333333333334, "grad_norm": 1.5380136630938068e-07, "kl": 0.04339599609375, "learning_rate": 1.6211477802783105e-05, "loss": 0.0017, "num_tokens": 69635833.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1075 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3586666666666667, "grad_norm": 2.7958495252278226e-07, "kl": 0.044677734375, "learning_rate": 
1.6202354912682602e-05, "loss": 0.0018, "num_tokens": 69712041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1076 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.359, "grad_norm": 3.8320197859320615e-07, "kl": 0.0447998046875, "learning_rate": 1.6193223625492604e-05, "loss": 0.0018, "num_tokens": 69788569.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1077 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.35933333333333334, "grad_norm": 3.4159901929342595e-07, "kl": 0.045654296875, "learning_rate": 1.6184083953575543e-05, "loss": 0.0018, "num_tokens": 69870809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1078 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3596666666666667, "grad_norm": 2.1942899763871537e-07, "kl": 0.052734375, "learning_rate": 1.6174935909305216e-05, "loss": 0.0021, "num_tokens": 69947705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1079 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.36, "grad_norm": 1.8553926395270537e-07, "kl": 0.042236328125, "learning_rate": 1.616577950506675e-05, "loss": 0.0017, "num_tokens": 70022473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1080 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.36033333333333334, "grad_norm": 3.3929265441656753e-07, "kl": 0.0478515625, "learning_rate": 1.6156614753256583e-05, "loss": 0.0019, "num_tokens": 70099321.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1081 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3606666666666667, "grad_norm": 3.1992675531000714e-07, "kl": 0.0438232421875, "learning_rate": 1.614744166628247e-05, "loss": 0.0018, "num_tokens": 70175673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1082 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.361, "grad_norm": 2.317240017646327e-07, "kl": 0.04638671875, "learning_rate": 1.613826025656343e-05, "loss": 0.0019, "num_tokens": 70253801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1083 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.36133333333333334, "grad_norm": 1.5721208512786689e-07, "kl": 0.0457763671875, "learning_rate": 1.6129070536529767e-05, "loss": 0.0018, "num_tokens": 70328233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1084 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3616666666666667, "grad_norm": 1.594924441405965e-07, "kl": 0.0447998046875, "learning_rate": 1.611987251862303e-05, "loss": 0.0018, "num_tokens": 70403433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1085 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.362, "grad_norm": 2.429415246751887e-07, "kl": 0.04754638671875, "learning_rate": 1.6110666215296e-05, "loss": 0.0019, "num_tokens": 70480297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1086 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.36233333333333334, "grad_norm": 2.861544032839447e-07, "kl": 0.04266357421875, "learning_rate": 1.610145163901268e-05, "loss": 0.0017, "num_tokens": 70556105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1087 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3626666666666667, "grad_norm": 1.616263176629218e-07, "kl": 0.0445556640625, "learning_rate": 
1.6092228802248264e-05, "loss": 0.0018, "num_tokens": 70629801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1088 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.363, "grad_norm": 2.057961694390542e-07, "kl": 0.0465087890625, "learning_rate": 1.6082997717489145e-05, "loss": 0.0019, "num_tokens": 70707177.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1089 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.36333333333333334, "grad_norm": 2.0469707351367106e-07, "kl": 0.046875, "learning_rate": 1.607375839723287e-05, "loss": 0.0019, "num_tokens": 70781705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1090 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3636666666666667, "grad_norm": 2.581525961886655e-07, "kl": 0.04473876953125, "learning_rate": 1.6064510853988137e-05, "loss": 0.0018, "num_tokens": 70858025.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1091 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.364, "grad_norm": 1.979224180104211e-07, "kl": 0.044921875, "learning_rate": 1.605525510027478e-05, "loss": 0.0018, "num_tokens": 70934425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1092 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.36433333333333334, "grad_norm": 1.7411876740425214e-07, "kl": 0.04656982421875, "learning_rate": 1.6045991148623752e-05, "loss": 0.0019, "num_tokens": 71010345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1093 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.36466666666666664, "grad_norm": 1.1158581258996492e-07, "kl": 0.04327392578125, "learning_rate": 1.6036719011577094e-05, "loss": 0.0017, "num_tokens": 71083833.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1094 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.365, "grad_norm": 2.1587241860743234e-07, "kl": 0.0477294921875, "learning_rate": 1.6027438701687937e-05, "loss": 0.0019, "num_tokens": 71162681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1095 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.36533333333333334, "grad_norm": 2.580104307980946e-07, "kl": 0.047119140625, "learning_rate": 1.6018150231520486e-05, "loss": 0.0019, "num_tokens": 71239001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1096 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.36566666666666664, "grad_norm": 2.466824184921279e-07, "kl": 0.0445556640625, "learning_rate": 1.600885361364997e-05, "loss": 0.0018, "num_tokens": 71313497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1097 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.366, "grad_norm": 1.476787048204642e-07, "kl": 0.04803466796875, "learning_rate": 1.5999548860662666e-05, "loss": 0.0019, "num_tokens": 71387737.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1098 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.36633333333333334, "grad_norm": 2.01558890466913e-07, "kl": 0.04541015625, "learning_rate": 1.599023598515586e-05, "loss": 0.0018, "num_tokens": 71462217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1099 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.36666666666666664, "grad_norm": 1.8711165239437832e-07, "kl": 0.044921875, "learning_rate": 
1.598091499973784e-05, "loss": 0.0018, "num_tokens": 71536489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1100 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.367, "grad_norm": 1.7722781819884403e-07, "kl": 0.0440673828125, "learning_rate": 1.5971585917027864e-05, "loss": 0.0018, "num_tokens": 71611337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.36733333333333335, "grad_norm": 1.782724154963944e-07, "kl": 0.046142578125, "learning_rate": 1.5962248749656158e-05, "loss": 0.0018, "num_tokens": 71687113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1102 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.36766666666666664, "grad_norm": 1.560152611546073e-07, "kl": 0.04559326171875, "learning_rate": 1.59529035102639e-05, "loss": 0.0018, "num_tokens": 71762393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.368, "grad_norm": 3.2420365414509433e-07, "kl": 0.04632568359375, "learning_rate": 1.594355021150318e-05, "loss": 0.0019, "num_tokens": 71837897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.36833333333333335, "grad_norm": 2.2718381842423696e-07, "kl": 0.0438232421875, "learning_rate": 1.5934188866037017e-05, "loss": 0.0018, "num_tokens": 71913209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1105 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.36866666666666664, "grad_norm": 2.565647605479171e-07, "kl": 0.0443115234375, "learning_rate": 1.592481948653931e-05, "loss": 0.0018, "num_tokens": 71988409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1106 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.369, "grad_norm": 3.3098220342253626e-07, "kl": 0.046142578125, "learning_rate": 1.591544208569484e-05, "loss": 0.0018, "num_tokens": 72069433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1107 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.36933333333333335, "grad_norm": 1.6562962912303192e-07, "kl": 0.045654296875, "learning_rate": 1.5906056676199256e-05, "loss": 0.0018, "num_tokens": 72147337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1108 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.36966666666666664, "grad_norm": 1.5054055779728515e-07, "kl": 0.0433349609375, "learning_rate": 1.5896663270759034e-05, "loss": 0.0017, "num_tokens": 72221961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1109 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37, "grad_norm": 2.499229196928354e-07, "kl": 0.043212890625, "learning_rate": 1.5887261882091488e-05, "loss": 0.0017, "num_tokens": 72297225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1110 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37033333333333335, "grad_norm": 1.498172537139908e-07, "kl": 0.04901123046875, "learning_rate": 1.5877852522924733e-05, "loss": 0.002, "num_tokens": 72371897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37066666666666664, "grad_norm": 2.1787293746911018e-07, "kl": 0.04730224609375, "learning_rate": 
1.586843520599768e-05, "loss": 0.0019, "num_tokens": 72447865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1112 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.371, "grad_norm": 1.8467947882072622e-07, "kl": 0.04278564453125, "learning_rate": 1.5859009944060005e-05, "loss": 0.0017, "num_tokens": 72524617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37133333333333335, "grad_norm": 1.8448743333010498e-07, "kl": 0.0465087890625, "learning_rate": 1.584957674987216e-05, "loss": 0.0019, "num_tokens": 72600009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1114 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37166666666666665, "grad_norm": 1.733871641818041e-07, "kl": 0.04290771484375, "learning_rate": 1.5840135636205305e-05, "loss": 0.0017, "num_tokens": 72674601.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.372, "grad_norm": 3.3537230592628475e-07, "kl": 0.051025390625, "learning_rate": 1.5830686615841348e-05, "loss": 0.002, "num_tokens": 72752153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1116 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.37233333333333335, "grad_norm": 2.052071295111091e-07, "kl": 0.0457763671875, "learning_rate": 1.5821229701572897e-05, "loss": 0.0018, "num_tokens": 72826473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37266666666666665, "grad_norm": 4.1592105048948724e-07, "kl": 0.046875, "learning_rate": 1.5811764906203235e-05, "loss": 0.0019, "num_tokens": 72903049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.373, "grad_norm": 2.8275951535761124e-07, "kl": 0.05181884765625, "learning_rate": 1.580229224254633e-05, "loss": 0.0021, "num_tokens": 72980441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37333333333333335, "grad_norm": 2.4958129074548197e-07, "kl": 0.0465087890625, "learning_rate": 1.5792811723426787e-05, "loss": 0.0019, "num_tokens": 73055465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1120 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37366666666666665, "grad_norm": 2.344265084275321e-07, "kl": 0.04681396484375, "learning_rate": 1.5783323361679865e-05, "loss": 0.0019, "num_tokens": 73131529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.374, "grad_norm": 1.1609753158836611e-07, "kl": 0.04510498046875, "learning_rate": 1.5773827170151425e-05, "loss": 0.0018, "num_tokens": 73209001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37433333333333335, "grad_norm": 1.638546081039749e-07, "kl": 0.04376220703125, "learning_rate": 1.5764323161697933e-05, "loss": 0.0018, "num_tokens": 73283513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1123 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37466666666666665, "grad_norm": 1.9412206597735349e-07, "kl": 0.04345703125, "learning_rate": 
1.5754811349186443e-05, "loss": 0.0017, "num_tokens": 73359273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1124 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.375, "grad_norm": 1.7888619652239868e-07, "kl": 0.0482177734375, "learning_rate": 1.5745291745494563e-05, "loss": 0.0019, "num_tokens": 73434777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1125 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37533333333333335, "grad_norm": 2.2083835915509553e-07, "kl": 0.04522705078125, "learning_rate": 1.573576436351046e-05, "loss": 0.0018, "num_tokens": 73509593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1126 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37566666666666665, "grad_norm": 2.259782405644728e-07, "kl": 0.04559326171875, "learning_rate": 1.5726229216132835e-05, "loss": 0.0018, "num_tokens": 73587641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1127 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.376, "grad_norm": 1.266240872155322e-07, "kl": 0.050048828125, "learning_rate": 1.5716686316270884e-05, "loss": 0.002, "num_tokens": 73661913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.37633333333333335, "grad_norm": 1.7839907684447098e-07, "kl": 0.04559326171875, "learning_rate": 1.570713567684432e-05, "loss": 0.0018, "num_tokens": 73737017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37666666666666665, "grad_norm": 2.0319306770488765e-07, "kl": 0.04742431640625, "learning_rate": 1.5697577310783318e-05, "loss": 0.0019, "num_tokens": 73811897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1130 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.377, "grad_norm": 2.0491488328389096e-07, "kl": 0.047607421875, "learning_rate": 1.568801123102852e-05, "loss": 0.0019, "num_tokens": 73886073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1131 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37733333333333335, "grad_norm": 2.960838685339695e-07, "kl": 0.04779052734375, "learning_rate": 1.5678437450531014e-05, "loss": 0.0019, "num_tokens": 73963897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1132 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37766666666666665, "grad_norm": 2.704465487113339e-07, "kl": 0.04620361328125, "learning_rate": 1.566885598225231e-05, "loss": 0.0018, "num_tokens": 74041833.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.378, "grad_norm": 2.1534745542339806e-07, "kl": 0.044921875, "learning_rate": 1.565926683916433e-05, "loss": 0.0018, "num_tokens": 74117753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1134 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37833333333333335, "grad_norm": 2.9199301820881374e-07, "kl": 0.045166015625, "learning_rate": 1.564967003424938e-05, "loss": 0.0018, "num_tokens": 74198281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37866666666666665, "grad_norm": 1.2356058221030253e-07, "kl": 0.043701171875, "learning_rate": 
1.5640065580500146e-05, "loss": 0.0017, "num_tokens": 74273209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.379, "grad_norm": 1.8146216973491391e-07, "kl": 0.04718017578125, "learning_rate": 1.5630453490919663e-05, "loss": 0.0019, "num_tokens": 74349257.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1137 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37933333333333336, "grad_norm": 2.1285757156874752e-07, "kl": 0.04803466796875, "learning_rate": 1.5620833778521306e-05, "loss": 0.0019, "num_tokens": 74426537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 
0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37966666666666665, "grad_norm": 2.053234311460983e-07, "kl": 0.0443115234375, "learning_rate": 1.561120645632878e-05, "loss": 0.0018, "num_tokens": 74502537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1139 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.38, "grad_norm": 2.053631504850273e-07, "kl": 0.04290771484375, "learning_rate": 1.560157153737607e-05, "loss": 0.0017, "num_tokens": 74575433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1140 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.38033333333333336, "grad_norm": 1.65422747500088e-07, "kl": 0.0465087890625, "learning_rate": 1.5591929034707468e-05, "loss": 0.0019, "num_tokens": 74650553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.38066666666666665, "grad_norm": 1.960008546575409e-07, "kl": 0.04534912109375, "learning_rate": 1.5582278961377524e-05, "loss": 0.0018, "num_tokens": 74725641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1142 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.381, "grad_norm": 2.108239982590021e-07, "kl": 0.0445556640625, "learning_rate": 1.5572621330451044e-05, "loss": 0.0018, "num_tokens": 74802345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1143 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.38133333333333336, "grad_norm": 2.5135875603155e-07, "kl": 0.04498291015625, "learning_rate": 1.556295615500305e-05, "loss": 0.0018, "num_tokens": 74878633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1144 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.38166666666666665, "grad_norm": 1.6932575874761824e-07, "kl": 0.045166015625, "learning_rate": 1.5553283448118795e-05, "loss": 0.0018, "num_tokens": 74954441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.382, "grad_norm": 2.2136832455998956e-07, "kl": 0.04522705078125, "learning_rate": 1.5543603222893718e-05, "loss": 0.0018, "num_tokens": 75031225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.38233333333333336, "grad_norm": 9.68626707731346e-08, "kl": 0.04071044921875, "learning_rate": 1.553391549243344e-05, "loss": 0.0016, "num_tokens": 75111865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.38266666666666665, "grad_norm": 3.990126060671173e-07, "kl": 0.046875, "learning_rate": 
1.5524220269853754e-05, "loss": 0.0019, "num_tokens": 75189449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1148 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.383, "grad_norm": 1.8346602814744983e-07, "kl": 0.04193115234375, "learning_rate": 1.5514517568280573e-05, "loss": 0.0017, "num_tokens": 75266153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1149 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.38333333333333336, "grad_norm": 2.1014508888583805e-07, "kl": 0.0460205078125, "learning_rate": 1.5504807400849957e-05, "loss": 0.0018, "num_tokens": 75340585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1150 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.38366666666666666, "grad_norm": 1.6201340713450918e-07, "kl": 0.0462646484375, "learning_rate": 1.5495089780708062e-05, "loss": 0.0019, "num_tokens": 75414937.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.384, "grad_norm": 1.5455056256996613e-07, "kl": 0.0445556640625, "learning_rate": 1.548536472101114e-05, "loss": 0.0018, "num_tokens": 75489113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.38433333333333336, "grad_norm": 2.0146632095929817e-07, "kl": 0.0430908203125, "learning_rate": 1.5475632234925505e-05, "loss": 0.0017, "num_tokens": 75564265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.38466666666666666, "grad_norm": 2.7232809429733607e-07, "kl": 0.049072265625, "learning_rate": 1.5465892335627537e-05, "loss": 0.002, "num_tokens": 75640489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.385, "grad_norm": 1.070246256063001e-07, "kl": 0.04351806640625, "learning_rate": 1.545614503630365e-05, "loss": 0.0017, "num_tokens": 75714009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.38533333333333336, "grad_norm": 1.6892342102892144e-07, "kl": 0.04608154296875, "learning_rate": 1.5446390350150272e-05, "loss": 0.0018, "num_tokens": 75789113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.38566666666666666, "grad_norm": 1.3307726476341486e-07, "kl": 0.04327392578125, "learning_rate": 1.5436628290373835e-05, "loss": 0.0017, "num_tokens": 75867945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.386, "grad_norm": 2.0743829054481466e-07, "kl": 0.04345703125, "learning_rate": 1.542685887019075e-05, "loss": 0.0017, "num_tokens": 75943657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3863333333333333, "grad_norm": 2.1642328817961243e-07, "kl": 0.047119140625, "learning_rate": 1.54170821028274e-05, "loss": 0.0019, "num_tokens": 76020585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1159 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.38666666666666666, "grad_norm": 1.1275442091118748e-07, "kl": 0.04425048828125, "learning_rate": 
1.5407298001520108e-05, "loss": 0.0018, "num_tokens": 76097641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1160 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.387, "grad_norm": 1.664246696009286e-07, "kl": 0.04681396484375, "learning_rate": 1.539750657951513e-05, "loss": 0.0019, "num_tokens": 76182489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3873333333333333, "grad_norm": 1.529209043837909e-07, "kl": 0.04425048828125, "learning_rate": 1.5387707850068633e-05, "loss": 0.0018, "num_tokens": 76260089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.38766666666666666, "grad_norm": 1.7008896691095288e-07, "kl": 0.04443359375, "learning_rate": 1.5377901826446672e-05, "loss": 0.0018, "num_tokens": 76333945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.388, "grad_norm": 1.8821098990429164e-07, "kl": 0.04425048828125, "learning_rate": 1.5368088521925185e-05, "loss": 0.0018, "num_tokens": 76410937.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.3883333333333333, "grad_norm": 2.3498549239775457e-07, "kl": 0.04876708984375, "learning_rate": 1.5358267949789968e-05, "loss": 0.002, "num_tokens": 76489929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.38866666666666666, "grad_norm": 1.8069387408559123e-07, "kl": 0.04754638671875, "learning_rate": 1.5348440123336647e-05, "loss": 0.0019, "num_tokens": 76566425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.389, "grad_norm": 1.5659249186228408e-07, "kl": 0.046875, "learning_rate": 1.533860505587067e-05, "loss": 0.0019, "num_tokens": 76644281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3893333333333333, "grad_norm": 1.54360108695073e-07, "kl": 0.04571533203125, "learning_rate": 1.53287627607073e-05, "loss": 0.0018, "num_tokens": 76719577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.38966666666666666, "grad_norm": 1.8486534258954634e-07, "kl": 0.04998779296875, "learning_rate": 1.531891325117158e-05, "loss": 0.002, "num_tokens": 76795849.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.39, "grad_norm": 4.238993369654054e-07, "kl": 0.04833984375, "learning_rate": 1.530905654059831e-05, "loss": 0.0019, "num_tokens": 76873849.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1170 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3903333333333333, "grad_norm": 2.80409778952162e-07, "kl": 0.047607421875, "learning_rate": 1.529919264233205e-05, "loss": 0.0019, "num_tokens": 76949305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.39066666666666666, "grad_norm": 2.9063102147119935e-07, "kl": 0.04736328125, "learning_rate": 1.5289321569727093e-05, 
"loss": 0.0019, "num_tokens": 77025401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.391, "grad_norm": 4.264112476448645e-07, "kl": 0.046875, "learning_rate": 1.5279443336147437e-05, "loss": 0.0019, "num_tokens": 77103433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3913333333333333, "grad_norm": 1.6834538030252588e-07, "kl": 0.04571533203125, "learning_rate": 1.5269557954966777e-05, "loss": 0.0018, "num_tokens": 77178249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 
0.0, "rewards/tag_count_reward/std": 0.0, "step": 1174 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.39166666666666666, "grad_norm": 3.0631557024207723e-07, "kl": 0.04766845703125, "learning_rate": 1.525966543956849e-05, "loss": 0.0019, "num_tokens": 77254729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.392, "grad_norm": 2.1066006183900754e-07, "kl": 0.04644775390625, "learning_rate": 1.5249765803345602e-05, "loss": 0.0019, "num_tokens": 77331001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, 
"epoch": 0.3923333333333333, "grad_norm": 1.3322177494501375e-07, "kl": 0.046142578125, "learning_rate": 1.5239859059700794e-05, "loss": 0.0018, "num_tokens": 77404425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.39266666666666666, "grad_norm": 2.616755807594018e-07, "kl": 0.04180908203125, "learning_rate": 1.5229945222046354e-05, "loss": 0.0017, "num_tokens": 77481529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1178 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.393, "grad_norm": 2.779126191398973e-07, "kl": 0.044677734375, "learning_rate": 1.5220024303804181e-05, "loss": 0.0018, "num_tokens": 77559561.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3933333333333333, "grad_norm": 2.0741408945923467e-07, "kl": 0.04693603515625, "learning_rate": 1.5210096318405768e-05, "loss": 0.0019, "num_tokens": 77635225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1180 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.39366666666666666, "grad_norm": 3.0884777402206964e-07, "kl": 0.04534912109375, "learning_rate": 1.5200161279292154e-05, "loss": 0.0018, "num_tokens": 77711577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.394, "grad_norm": 0.00032706017373129725, "kl": 0.046142578125, "learning_rate": 1.5190219199913956e-05, "loss": 0.0018, "num_tokens": 77790569.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3943333333333333, "grad_norm": 2.1506916425551026e-07, "kl": 0.04388427734375, "learning_rate": 1.5180270093731305e-05, "loss": 0.0018, "num_tokens": 77866937.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.39466666666666667, "grad_norm": 2.786361790185765e-07, "kl": 0.04412841796875, "learning_rate": 1.5170313974213841e-05, "loss": 0.0018, 
"num_tokens": 77943721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.395, "grad_norm": 2.4016597421905317e-07, "kl": 0.04547119140625, "learning_rate": 1.5160350854840715e-05, "loss": 0.0018, "num_tokens": 78019817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3953333333333333, "grad_norm": 1.274765253356236e-07, "kl": 0.04339599609375, "learning_rate": 1.5150380749100545e-05, "loss": 0.0017, "num_tokens": 78095721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 1186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.39566666666666667, "grad_norm": 3.0003371875864104e-07, "kl": 0.0465087890625, "learning_rate": 1.5140403670491406e-05, "loss": 0.0019, "num_tokens": 78171657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.396, "grad_norm": 2.5176740336974035e-07, "kl": 0.04742431640625, "learning_rate": 1.5130419632520814e-05, "loss": 0.0019, "num_tokens": 78247065.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.3963333333333333, "grad_norm": 2.1937216843070928e-07, "kl": 0.04278564453125, "learning_rate": 1.5120428648705716e-05, "loss": 0.0017, "num_tokens": 78322809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.39666666666666667, "grad_norm": 2.1508839154193993e-07, "kl": 0.0479736328125, "learning_rate": 1.5110430732572454e-05, "loss": 0.0019, "num_tokens": 78397609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1190 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.397, "grad_norm": 8.586322763903809e-08, "kl": 0.04803466796875, "learning_rate": 1.5100425897656754e-05, "loss": 0.0019, "num_tokens": 78471801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1191 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3973333333333333, "grad_norm": 1.683656876139139e-07, "kl": 0.04522705078125, "learning_rate": 1.5090414157503715e-05, "loss": 0.0018, "num_tokens": 78547721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.39766666666666667, "grad_norm": 1.384997005970945e-07, "kl": 0.04327392578125, "learning_rate": 1.508039552566778e-05, "loss": 0.0017, "num_tokens": 78622745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.398, "grad_norm": 2.0212321771850839e-07, "kl": 0.04644775390625, "learning_rate": 1.5070370015712727e-05, "loss": 0.0019, "num_tokens": 78698265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3983333333333333, "grad_norm": 1.7368222415825585e-07, "kl": 0.0447998046875, "learning_rate": 1.5060337641211637e-05, "loss": 0.0018, "num_tokens": 78776089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.39866666666666667, "grad_norm": 1.2539558724711242e-07, "kl": 0.04925537109375, "learning_rate": 1.5050298415746903e-05, "loss": 0.002, 
"num_tokens": 78851273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1196 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.399, "grad_norm": 2.0768104036505974e-07, "kl": 0.0452880859375, "learning_rate": 1.5040252352910168e-05, "loss": 0.0018, "num_tokens": 78929033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3993333333333333, "grad_norm": 1.5647692919174006e-07, "kl": 0.04742431640625, "learning_rate": 1.5030199466302354e-05, "loss": 0.0019, "num_tokens": 79003129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 1198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.39966666666666667, "grad_norm": 1.1520349829652332e-07, "kl": 0.043212890625, "learning_rate": 1.5020139769533604e-05, "loss": 0.0017, "num_tokens": 79076185.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1199 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4, "grad_norm": 1.4021510708062124e-07, "kl": 0.0445556640625, "learning_rate": 1.5010073276223295e-05, "loss": 0.0018, "num_tokens": 79149657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1200 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.4003333333333333, "grad_norm": 1.8614881014400453e-07, "kl": 0.04443359375, "learning_rate": 1.5000000000000002e-05, "loss": 0.0018, "num_tokens": 79225913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1201 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.40066666666666667, "grad_norm": 1.2410062311118963e-07, "kl": 0.04364013671875, "learning_rate": 1.4989919954501474e-05, "loss": 0.0017, "num_tokens": 79300089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1202 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.401, "grad_norm": 2.0684252888258925e-07, "kl": 0.04833984375, "learning_rate": 1.4979833153374636e-05, "loss": 0.0019, "num_tokens": 79377321.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1203 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4013333333333333, "grad_norm": 1.9826116215426737e-07, "kl": 0.0465087890625, "learning_rate": 1.4969739610275556e-05, "loss": 0.0019, "num_tokens": 79452633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.40166666666666667, "grad_norm": 1.449653552754171e-07, "kl": 0.048583984375, "learning_rate": 1.4959639338869423e-05, "loss": 0.0019, "num_tokens": 79526681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1205 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.402, "grad_norm": 1.6206243458327663e-07, "kl": 0.04620361328125, "learning_rate": 1.4949532352830543e-05, "loss": 0.0018, "num_tokens": 79601065.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4023333333333333, "grad_norm": 2.899886055729439e-07, "kl": 0.050048828125, "learning_rate": 1.493941866584231e-05, "loss": 0.002, "num_tokens": 79676745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1207 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4026666666666667, "grad_norm": 2.1126996330167458e-07, "kl": 0.0498046875, "learning_rate": 1.4929298291597195e-05, "loss": 0.002, "num_tokens": 
79751977.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.403, "grad_norm": 2.8342509494905244e-07, "kl": 0.04742431640625, "learning_rate": 1.4919171243796706e-05, "loss": 0.0019, "num_tokens": 79827577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1209 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4033333333333333, "grad_norm": 1.6033871474974148e-07, "kl": 0.04632568359375, "learning_rate": 1.490903753615141e-05, "loss": 0.0019, "num_tokens": 79902825.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 1210 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4036666666666667, "grad_norm": 2.413310653537337e-07, "kl": 0.0455322265625, "learning_rate": 1.4898897182380872e-05, "loss": 0.0018, "num_tokens": 79980281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1211 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.404, "grad_norm": 1.9254416372405103e-07, "kl": 0.04681396484375, "learning_rate": 1.4888750196213661e-05, "loss": 0.0019, "num_tokens": 80055401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1212 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.4043333333333333, "grad_norm": 2.6125357521777914e-07, "kl": 0.0460205078125, "learning_rate": 1.4878596591387329e-05, "loss": 0.0018, "num_tokens": 80136905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1213 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4046666666666667, "grad_norm": 2.127672473761777e-07, "kl": 0.04913330078125, "learning_rate": 1.486843638164838e-05, "loss": 0.002, "num_tokens": 80220537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1214 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.405, "grad_norm": 1.314412116926178e-07, "kl": 0.04498291015625, "learning_rate": 1.4858269580752272e-05, "loss": 0.0018, "num_tokens": 80295337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4053333333333333, "grad_norm": 1.46661093936018e-07, "kl": 0.041748046875, "learning_rate": 1.4848096202463373e-05, "loss": 0.0017, "num_tokens": 80375017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4056666666666667, "grad_norm": 1.2819751304959937e-07, "kl": 0.04815673828125, "learning_rate": 1.4837916260554966e-05, "loss": 0.0019, "num_tokens": 80453433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1217 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.406, "grad_norm": 1.4237167533792672e-07, "kl": 0.04864501953125, "learning_rate": 1.4827729768809215e-05, "loss": 0.0019, "num_tokens": 80528857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1218 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4063333333333333, "grad_norm": 1.5206752834728832e-07, "kl": 0.04400634765625, "learning_rate": 1.4817536741017153e-05, "loss": 0.0018, "num_tokens": 80603961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4066666666666667, "grad_norm": 2.4158873657142976e-07, "kl": 0.044189453125, "learning_rate": 1.4807337190978666e-05, "loss": 0.0018, 
"num_tokens": 80683657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1220 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.407, "grad_norm": 2.102984524299245e-07, "kl": 0.044189453125, "learning_rate": 1.4797131132502464e-05, "loss": 0.0018, "num_tokens": 80758361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4073333333333333, "grad_norm": 1.6987705464543978e-07, "kl": 0.040771484375, "learning_rate": 1.478691857940607e-05, "loss": 0.0016, "num_tokens": 80834777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 1222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4076666666666667, "grad_norm": 1.2850320274537808e-07, "kl": 0.04522705078125, "learning_rate": 1.47766995455158e-05, "loss": 0.0018, "num_tokens": 80908665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.408, "grad_norm": 2.0308661419221608e-07, "kl": 0.044677734375, "learning_rate": 1.4766474044666748e-05, "loss": 0.0018, "num_tokens": 80983545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.4083333333333333, "grad_norm": 1.6899363686206925e-07, "kl": 0.05181884765625, "learning_rate": 1.4756242090702756e-05, "loss": 0.0021, "num_tokens": 81062521.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1225 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4086666666666667, "grad_norm": 1.781420309043824e-07, "kl": 0.048095703125, "learning_rate": 1.4746003697476406e-05, "loss": 0.0019, "num_tokens": 81140249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1226 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.409, "grad_norm": 1.2896579448806733e-07, "kl": 0.04437255859375, "learning_rate": 1.4735758878849e-05, "loss": 0.0018, "num_tokens": 81218105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 
0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4093333333333333, "grad_norm": 1.3127252884714835e-07, "kl": 0.046142578125, "learning_rate": 1.4725507648690542e-05, "loss": 0.0018, "num_tokens": 81292457.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1228 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4096666666666667, "grad_norm": 2.1874187439152593e-07, "kl": 0.04534912109375, "learning_rate": 1.4715250020879705e-05, "loss": 0.0018, "num_tokens": 81366745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1229 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, 
"completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.41, "grad_norm": 1.505979128069157e-07, "kl": 0.04302978515625, "learning_rate": 1.4704986009303833e-05, "loss": 0.0017, "num_tokens": 81442249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1230 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4103333333333333, "grad_norm": 2.496243212135596e-07, "kl": 0.046875, "learning_rate": 1.469471562785891e-05, "loss": 0.0019, "num_tokens": 81520809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1231 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4106666666666667, "grad_norm": 0.00035431934520602226, "kl": 0.04376220703125, "learning_rate": 1.4684438890449542e-05, "loss": 0.0018, "num_tokens": 81595593.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1232 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.411, "grad_norm": 1.585467970244281e-07, "kl": 0.04498291015625, "learning_rate": 1.4674155810988944e-05, "loss": 0.0018, "num_tokens": 81671913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1233 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.41133333333333333, "grad_norm": 1.5489291627091006e-07, "kl": 0.04461669921875, "learning_rate": 1.4663866403398915e-05, "loss": 0.0018, "num_tokens": 81746793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1234 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4116666666666667, "grad_norm": 2.075672256296457e-07, "kl": 0.04388427734375, "learning_rate": 1.4653570681609816e-05, "loss": 0.0018, "num_tokens": 81822601.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1235 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.412, "grad_norm": 1.7507488792034565e-07, "kl": 0.0443115234375, "learning_rate": 1.4643268659560571e-05, "loss": 0.0018, "num_tokens": 81904633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1236 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.41233333333333333, "grad_norm": 1.3837284029705188e-07, 
"kl": 0.04388427734375, "learning_rate": 1.463296035119862e-05, "loss": 0.0018, "num_tokens": 81980345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1237 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4126666666666667, "grad_norm": 1.48684009104727e-07, "kl": 0.04254150390625, "learning_rate": 1.4622645770479915e-05, "loss": 0.0017, "num_tokens": 82056985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1238 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.413, "grad_norm": 1.086328538235648e-07, "kl": 0.041748046875, "learning_rate": 1.4612324931368909e-05, "loss": 0.0017, "num_tokens": 82130793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1239 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.41333333333333333, "grad_norm": 2.029710941542362e-07, "kl": 0.04400634765625, "learning_rate": 1.4601997847838518e-05, "loss": 0.0018, "num_tokens": 82207657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1240 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4136666666666667, "grad_norm": 1.904505495531339e-07, "kl": 0.04705810546875, "learning_rate": 1.4591664533870118e-05, "loss": 0.0019, "num_tokens": 82282425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1241 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.414, "grad_norm": 1.70188158676865e-07, "kl": 0.04486083984375, "learning_rate": 1.458132500345352e-05, "loss": 0.0018, "num_tokens": 82358569.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1242 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.41433333333333333, "grad_norm": 1.702246237300642e-07, "kl": 0.04901123046875, "learning_rate": 1.4570979270586944e-05, "loss": 0.002, "num_tokens": 82434393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1243 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4146666666666667, "grad_norm": 1.6712797901163867e-07, "kl": 0.04791259765625, "learning_rate": 1.4560627349277017e-05, "loss": 0.0019, "num_tokens": 82510425.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1244 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.415, "grad_norm": 1.8289907188773213e-07, "kl": 0.04327392578125, "learning_rate": 1.4550269253538739e-05, "loss": 0.0017, "num_tokens": 82584985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1245 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.41533333333333333, "grad_norm": 2.3691565331773745e-07, "kl": 0.045166015625, "learning_rate": 1.4539904997395468e-05, "loss": 0.0018, "num_tokens": 82660153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1246 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4156666666666667, "grad_norm": 2.854472143098974e-07, "kl": 0.04815673828125, "learning_rate": 1.452953459487891e-05, "loss": 0.0019, "num_tokens": 82737913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1247 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.416, "grad_norm": 1.0733805311247124e-07, "kl": 0.04669189453125, "learning_rate": 1.4519158060029081e-05, "loss": 0.0019, "num_tokens": 82812345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1248 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.41633333333333333, "grad_norm": 1.1579354719515322e-07, 
"kl": 0.04541015625, "learning_rate": 1.4508775406894308e-05, "loss": 0.0018, "num_tokens": 82889769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1249 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4166666666666667, "grad_norm": 9.00773287071388e-08, "kl": 0.04547119140625, "learning_rate": 1.4498386649531198e-05, "loss": 0.0018, "num_tokens": 82964649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1250 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.417, "grad_norm": 1.5067870151597162e-07, "kl": 0.04046630859375, "learning_rate": 1.4487991802004625e-05, "loss": 0.0016, "num_tokens": 83039529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1251 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.41733333333333333, "grad_norm": 1.863725742623501e-07, "kl": 0.0452880859375, "learning_rate": 1.4477590878387697e-05, "loss": 0.0018, "num_tokens": 83115241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1252 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4176666666666667, "grad_norm": 1.9447332988420385e-07, "kl": 0.045654296875, "learning_rate": 1.4467183892761769e-05, "loss": 0.0018, "num_tokens": 83191865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1253 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 
0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.418, "grad_norm": 1.9787889016242843e-07, "kl": 0.04180908203125, "learning_rate": 1.4456770859216383e-05, "loss": 0.0017, "num_tokens": 83270009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1254 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.41833333333333333, "grad_norm": 2.1966877739032498e-07, "kl": 0.0504150390625, "learning_rate": 1.4446351791849276e-05, "loss": 0.002, "num_tokens": 83348457.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1255 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4186666666666667, "grad_norm": 1.6139181013841153e-07, "kl": 0.04248046875, "learning_rate": 1.4435926704766364e-05, "loss": 0.0017, "num_tokens": 83423993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1256 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.419, "grad_norm": 2.2698210955240938e-07, "kl": 0.04669189453125, "learning_rate": 1.442549561208169e-05, "loss": 0.0019, "num_tokens": 83499577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1257 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.41933333333333334, "grad_norm": 2.0261242639207921e-07, "kl": 0.04986572265625, "learning_rate": 1.4415058527917454e-05, "loss": 0.002, "num_tokens": 83575897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1258 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4196666666666667, "grad_norm": 9.265646383482817e-08, "kl": 0.04656982421875, "learning_rate": 1.4404615466403951e-05, "loss": 0.0019, "num_tokens": 83651193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1259 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.42, "grad_norm": 1.5757623827994394e-07, "kl": 0.04547119140625, "learning_rate": 1.439416644167957e-05, "loss": 0.0018, "num_tokens": 83727785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1260 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.42033333333333334, "grad_norm": 1.3037285384598363e-07, "kl": 0.04705810546875, "learning_rate": 
1.4383711467890776e-05, "loss": 0.0019, "num_tokens": 83801673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1261 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4206666666666667, "grad_norm": 8.008100849110633e-08, "kl": 0.04443359375, "learning_rate": 1.4373250559192088e-05, "loss": 0.0018, "num_tokens": 83876169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1262 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.421, "grad_norm": 1.897480501611426e-07, "kl": 0.04693603515625, "learning_rate": 1.4362783729746068e-05, "loss": 0.0019, "num_tokens": 83951737.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1263 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.42133333333333334, "grad_norm": 9.561996705542697e-08, "kl": 0.04522705078125, "learning_rate": 1.4352310993723277e-05, "loss": 0.0018, "num_tokens": 84030009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1264 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4216666666666667, "grad_norm": 1.865805643319618e-07, "kl": 0.04736328125, "learning_rate": 1.4341832365302282e-05, "loss": 0.0019, "num_tokens": 84108601.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1265 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.422, "grad_norm": 1.5509465356444707e-07, "kl": 0.04541015625, "learning_rate": 1.4331347858669631e-05, "loss": 0.0018, "num_tokens": 84183753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1266 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.42233333333333334, "grad_norm": 1.7682758368664508e-07, "kl": 0.0458984375, "learning_rate": 1.4320857488019826e-05, "loss": 0.0018, "num_tokens": 84260745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1267 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4226666666666667, "grad_norm": 1.6690631809979095e-07, "kl": 0.04571533203125, "learning_rate": 1.4310361267555302e-05, "loss": 0.0018, "num_tokens": 84337145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1268 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.423, "grad_norm": 1.7728550005813304e-07, "kl": 0.04742431640625, "learning_rate": 1.4299859211486429e-05, "loss": 0.0019, "num_tokens": 84412489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1269 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.42333333333333334, "grad_norm": 1.2475776145493e-07, "kl": 0.0467529296875, "learning_rate": 1.4289351334031461e-05, "loss": 0.0019, "num_tokens": 84486201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1270 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4236666666666667, "grad_norm": 1.92818504274328e-07, "kl": 0.04327392578125, "learning_rate": 1.4278837649416543e-05, "loss": 0.0017, "num_tokens": 84562777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1271 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.424, "grad_norm": 1.1781333597582488e-07, "kl": 0.04620361328125, "learning_rate": 1.4268318171875683e-05, "loss": 0.0018, "num_tokens": 84637529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": null, "rewards/penalized_accuracy_reward/std": null, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1272 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.42433333333333334, "grad_norm": 1.0294044727743312e-07, "kl": 0.042236328125, "learning_rate": 1.4257792915650728e-05, "loss": 0.0017, 
"num_tokens": 84715337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1273 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4246666666666667, "grad_norm": 1.5438126865774393e-07, "kl": 0.0435791015625, "learning_rate": 1.4247261894991344e-05, "loss": 0.0017, "num_tokens": 84791577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1274 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.425, "grad_norm": 1.4589319619062735e-07, "kl": 0.0506591796875, "learning_rate": 1.4236725124155015e-05, "loss": 0.002, "num_tokens": 84872169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 1275 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.42533333333333334, "grad_norm": 2.2272581645665923e-07, "kl": 0.0458984375, "learning_rate": 1.4226182617406996e-05, "loss": 0.0018, "num_tokens": 84950089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1276 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4256666666666667, "grad_norm": 1.4727811503689736e-07, "kl": 0.04754638671875, "learning_rate": 1.4215634389020314e-05, "loss": 0.0019, "num_tokens": 85026921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1277 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, 
"epoch": 0.426, "grad_norm": 1.4976529882915202e-07, "kl": 0.04833984375, "learning_rate": 1.4205080453275739e-05, "loss": 0.0019, "num_tokens": 85103625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1278 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.42633333333333334, "grad_norm": 1.0492107094250969e-07, "kl": 0.047607421875, "learning_rate": 1.4194520824461773e-05, "loss": 0.0019, "num_tokens": 85176809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1279 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4266666666666667, "grad_norm": 1.578942203650513e-07, "kl": 0.04119873046875, "learning_rate": 1.4183955516874624e-05, "loss": 0.0016, "num_tokens": 85254489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1280 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.427, "grad_norm": 1.8067058249471302e-07, "kl": 0.04754638671875, "learning_rate": 1.417338454481818e-05, "loss": 0.0019, "num_tokens": 85329529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1281 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.42733333333333334, "grad_norm": 1.1183752945953529e-07, "kl": 0.04620361328125, "learning_rate": 1.4162807922604014e-05, "loss": 0.0018, "num_tokens": 85404089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1282 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.42766666666666664, "grad_norm": 1.6645712719309813e-07, "kl": 0.04473876953125, "learning_rate": 1.4152225664551333e-05, "loss": 0.0018, "num_tokens": 85479001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1283 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.428, "grad_norm": 1.1262783772281182e-07, "kl": 0.04620361328125, "learning_rate": 1.4141637784986984e-05, "loss": 0.0019, "num_tokens": 85553657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1284 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.42833333333333334, "grad_norm": 1.2058814036208787e-07, "kl": 0.0472412109375, "learning_rate": 1.413104429824542e-05, "loss": 0.0019, 
"num_tokens": 85628217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1285 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.42866666666666664, "grad_norm": 1.3350576466564235e-07, "kl": 0.04296875, "learning_rate": 1.4120445218668687e-05, "loss": 0.0017, "num_tokens": 85703209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1286 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.429, "grad_norm": 1.0737957722994906e-07, "kl": 0.05096435546875, "learning_rate": 1.4109840560606397e-05, "loss": 0.002, "num_tokens": 85778473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 1287 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.42933333333333334, "grad_norm": 1.1782927344938798e-07, "kl": 0.04388427734375, "learning_rate": 1.4099230338415728e-05, "loss": 0.0018, "num_tokens": 85852473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1288 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.42966666666666664, "grad_norm": 9.832595537773159e-08, "kl": 0.04364013671875, "learning_rate": 1.408861456646138e-05, "loss": 0.0017, "num_tokens": 85925849.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1289 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 
0.0, "epoch": 0.43, "grad_norm": 1.0591865873266215e-07, "kl": 0.04486083984375, "learning_rate": 1.4077993259115568e-05, "loss": 0.0018, "num_tokens": 86000713.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1290 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43033333333333335, "grad_norm": 2.2875700267377397e-07, "kl": 0.0421142578125, "learning_rate": 1.4067366430758004e-05, "loss": 0.0017, "num_tokens": 86077049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1291 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43066666666666664, "grad_norm": 2.0644961296056863e-07, "kl": 0.0439453125, "learning_rate": 1.405673409577587e-05, "loss": 0.0018, "num_tokens": 86154377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1292 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.431, "grad_norm": 2.0429293101642543e-07, "kl": 0.0460205078125, "learning_rate": 1.4046096268563814e-05, "loss": 0.0018, "num_tokens": 86228985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1293 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43133333333333335, "grad_norm": 1.844732508970992e-07, "kl": 0.04315185546875, "learning_rate": 1.4035452963523903e-05, "loss": 0.0017, "num_tokens": 86306745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1294 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 
0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43166666666666664, "grad_norm": 1.8191859396665677e-07, "kl": 0.047119140625, "learning_rate": 1.402480419506563e-05, "loss": 0.0019, "num_tokens": 86382777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1295 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.432, "grad_norm": 9.757667385201785e-08, "kl": 0.044677734375, "learning_rate": 1.4014149977605893e-05, "loss": 0.0018, "num_tokens": 86456633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1296 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43233333333333335, "grad_norm": 2.0536421629913093e-07, "kl": 0.04278564453125, "learning_rate": 1.4003490325568953e-05, "loss": 0.0017, "num_tokens": 86532601.0, "reward": 0.0, 
"reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1297 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43266666666666664, "grad_norm": 1157.24365234375, "kl": 16.53192138671875, "learning_rate": 1.3992825253386428e-05, "loss": 0.6595, "num_tokens": 86612905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1298 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.433, "grad_norm": 7.503676613396237e-08, "kl": 0.0423583984375, "learning_rate": 1.3982154775497287e-05, "loss": 0.0017, "num_tokens": 86686489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 
1299 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43333333333333335, "grad_norm": 5.4179398745191065e-08, "kl": 0.0374755859375, "learning_rate": 1.3971478906347806e-05, "loss": 0.0015, "num_tokens": 86762089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1300 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43366666666666664, "grad_norm": 4.304170531099771e-08, "kl": 0.044921875, "learning_rate": 1.396079766039157e-05, "loss": 0.0018, "num_tokens": 86839145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1301 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.434, "grad_norm": 
3.2212387424124245e-08, "kl": 0.04461669921875, "learning_rate": 1.3950111052089432e-05, "loss": 0.0018, "num_tokens": 86912729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1302 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43433333333333335, "grad_norm": 1.4443187978940841e-08, "kl": 0.04534912109375, "learning_rate": 1.3939419095909513e-05, "loss": 0.0018, "num_tokens": 86986969.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1303 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43466666666666665, "grad_norm": 1.2370498403413421e-08, "kl": 0.0509033203125, "learning_rate": 1.3928721806327173e-05, "loss": 0.002, "num_tokens": 87060809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1304 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.435, "grad_norm": 2.5710283679813983e-08, "kl": 0.0479736328125, "learning_rate": 1.3918019197824985e-05, "loss": 0.0019, "num_tokens": 87136425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1305 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43533333333333335, "grad_norm": 1.757689993553413e-08, "kl": 0.04754638671875, "learning_rate": 1.3907311284892737e-05, "loss": 0.0019, "num_tokens": 87213049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1306 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43566666666666665, "grad_norm": 5.390435209307043e-09, "kl": 0.0445556640625, "learning_rate": 1.389659808202739e-05, "loss": 0.0018, "num_tokens": 87287529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1307 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.436, "grad_norm": 8.44595682281124e-09, "kl": 0.04620361328125, "learning_rate": 1.3885879603733066e-05, "loss": 0.0018, "num_tokens": 87362745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1308 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43633333333333335, "grad_norm": 6.079368120026629e-09, "kl": 0.04791259765625, "learning_rate": 1.3875155864521031e-05, "loss": 0.0019, "num_tokens": 87443769.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1309 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43666666666666665, "grad_norm": 9.99597205009195e-09, "kl": 0.04534912109375, "learning_rate": 1.3864426878909674e-05, "loss": 0.0018, "num_tokens": 87520905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1310 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.437, "grad_norm": 5.980236306157849e-09, "kl": 0.04656982421875, "learning_rate": 1.3853692661424485e-05, "loss": 0.0019, "num_tokens": 87594425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1311 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43733333333333335, "grad_norm": 4.2112660025850346e-09, "kl": 0.04803466796875, "learning_rate": 1.3842953226598036e-05, "loss": 0.0019, "num_tokens": 87669177.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1312 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43766666666666665, "grad_norm": 3.886565291821853e-09, "kl": 0.04547119140625, "learning_rate": 1.3832208588969975e-05, "loss": 0.0018, "num_tokens": 87743449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1313 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.438, "grad_norm": 2.714927394009692e-09, 
"kl": 0.04595947265625, "learning_rate": 1.3821458763086973e-05, "loss": 0.0018, "num_tokens": 87818073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1314 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43833333333333335, "grad_norm": 2.8266256002495993e-09, "kl": 0.04498291015625, "learning_rate": 1.3810703763502744e-05, "loss": 0.0018, "num_tokens": 87892873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1315 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43866666666666665, "grad_norm": 2.116833153209541e-09, "kl": 0.04107666015625, "learning_rate": 1.3799943604777993e-05, "loss": 0.0016, "num_tokens": 87968873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.439, "grad_norm": 3.138934001611915e-09, "kl": 0.04656982421875, "learning_rate": 1.3789178301480415e-05, "loss": 0.0019, "num_tokens": 88043081.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43933333333333335, "grad_norm": 3.457683028784686e-09, "kl": 0.04510498046875, "learning_rate": 1.3778407868184674e-05, "loss": 0.0018, "num_tokens": 88119225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1318 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43966666666666665, "grad_norm": 2.765699003148825e-09, "kl": 0.04766845703125, "learning_rate": 1.3767632319472373e-05, "loss": 0.0019, "num_tokens": 88198617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44, "grad_norm": 2.8450444222727356e-09, "kl": 0.0477294921875, "learning_rate": 1.375685166993204e-05, "loss": 0.0019, "num_tokens": 88274873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1320 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44033333333333335, "grad_norm": 1.6249828149739187e-09, "kl": 0.04498291015625, "learning_rate": 1.3746065934159123e-05, "loss": 0.0018, "num_tokens": 88352441.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1321 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44066666666666665, "grad_norm": 1.7124592854855791e-09, "kl": 0.04473876953125, "learning_rate": 1.3735275126755933e-05, "loss": 0.0018, "num_tokens": 88427785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.441, "grad_norm": 1.8947714508499303e-09, "kl": 0.0467529296875, "learning_rate": 1.3724479262331662e-05, "loss": 0.0019, "num_tokens": 88501769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1323 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44133333333333336, "grad_norm": 1.7100232341249466e-09, "kl": 0.05010986328125, "learning_rate": 1.371367835550235e-05, "loss": 0.002, "num_tokens": 88585097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1324 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44166666666666665, "grad_norm": 1.8501377097024374e-09, "kl": 0.04571533203125, "learning_rate": 1.3702872420890853e-05, "loss": 0.0018, "num_tokens": 88660857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1325 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.442, "grad_norm": 2.5744724130305485e-09, 
"kl": 0.04754638671875, "learning_rate": 1.3692061473126845e-05, "loss": 0.0019, "num_tokens": 88738873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1326 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44233333333333336, "grad_norm": 1.300720864350069e-09, "kl": 0.04833984375, "learning_rate": 1.3681245526846782e-05, "loss": 0.0019, "num_tokens": 88812921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1327 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44266666666666665, "grad_norm": 2.000139165403425e-09, "kl": 0.045654296875, "learning_rate": 1.3670424596693884e-05, "loss": 0.0018, "num_tokens": 88888297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1328 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.443, "grad_norm": 1.4673513515361947e-09, "kl": 0.0445556640625, "learning_rate": 1.3659598697318122e-05, "loss": 0.0018, "num_tokens": 88962489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1329 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44333333333333336, "grad_norm": 2.1496429081224733e-09, "kl": 0.04644775390625, "learning_rate": 1.3648767843376196e-05, "loss": 0.0019, "num_tokens": 89038297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1330 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44366666666666665, "grad_norm": 2.0196293526453246e-09, "kl": 0.04058837890625, "learning_rate": 1.3637932049531517e-05, "loss": 0.0016, "num_tokens": 89118265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1331 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.444, "grad_norm": 1.716194519829628e-09, "kl": 0.04852294921875, "learning_rate": 1.3627091330454172e-05, "loss": 0.0019, "num_tokens": 89193257.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1332 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44433333333333336, "grad_norm": 1.683356787296475e-09, "kl": 0.04669189453125, "learning_rate": 1.3616245700820922e-05, "loss": 0.0019, "num_tokens": 89268697.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1333 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44466666666666665, "grad_norm": 1.1414635903150838e-09, "kl": 0.04693603515625, "learning_rate": 1.3605395175315188e-05, "loss": 0.0019, "num_tokens": 89346873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1334 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.445, "grad_norm": 1.3487523320421246e-09, "kl": 0.041168212890625, "learning_rate": 1.3594539768627e-05, "loss": 0.0016, "num_tokens": 89424377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1335 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44533333333333336, "grad_norm": 1.5093900573859287e-09, "kl": 0.04534912109375, "learning_rate": 1.3583679495453e-05, "loss": 0.0018, "num_tokens": 89499353.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1336 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44566666666666666, "grad_norm": 1.5548657916752973e-09, "kl": 0.04779052734375, "learning_rate": 1.3572814370496441e-05, "loss": 0.0019, "num_tokens": 89576169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1337 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.446, "grad_norm": 1.6952255155189278e-09, 
"kl": 0.04583740234375, "learning_rate": 1.3561944408467112e-05, "loss": 0.0018, "num_tokens": 89650089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1338 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44633333333333336, "grad_norm": 1.664295923298198e-09, "kl": 0.04608154296875, "learning_rate": 1.3551069624081372e-05, "loss": 0.0018, "num_tokens": 89725305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1339 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44666666666666666, "grad_norm": 1.7554038223011048e-09, "kl": 0.0465087890625, "learning_rate": 1.3540190032062102e-05, "loss": 0.0019, "num_tokens": 89799497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1340 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.447, "grad_norm": 1.3865159020909346e-09, "kl": 0.04388427734375, "learning_rate": 1.3529305647138689e-05, "loss": 0.0018, "num_tokens": 89873033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1341 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44733333333333336, "grad_norm": 1.2171670338290141e-09, "kl": 0.04754638671875, "learning_rate": 1.3518416484047018e-05, "loss": 0.0019, "num_tokens": 89946921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1342 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44766666666666666, "grad_norm": 1.6039399808320809e-09, "kl": 0.04608154296875, "learning_rate": 1.3507522557529438e-05, "loss": 0.0018, "num_tokens": 90021705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1343 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.448, "grad_norm": 1.9966848174846064e-09, "kl": 0.0465087890625, "learning_rate": 1.3496623882334738e-05, "loss": 0.0019, "num_tokens": 90098553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1344 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4483333333333333, "grad_norm": 1.5451780965847206e-09, "kl": 0.04833984375, "learning_rate": 1.3485720473218153e-05, "loss": 0.0019, "num_tokens": 90172921.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1345 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44866666666666666, "grad_norm": 1.5867720470907898e-09, "kl": 0.04302978515625, "learning_rate": 1.3474812344941315e-05, "loss": 0.0017, "num_tokens": 90250441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1346 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.449, "grad_norm": 9.50118317355475e-10, "kl": 0.04559326171875, "learning_rate": 1.3463899512272249e-05, "loss": 0.0018, "num_tokens": 90324649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1347 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4493333333333333, "grad_norm": 1.4622596467006588e-09, "kl": 0.04290771484375, "learning_rate": 1.3452981989985347e-05, "loss": 0.0017, "num_tokens": 90399945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1348 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.44966666666666666, "grad_norm": 1.6429176907806209e-09, "kl": 0.04595947265625, "learning_rate": 1.3442059792861356e-05, "loss": 0.0018, "num_tokens": 90474921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1349 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.45, "grad_norm": 1.171096331020749e-09, 
"kl": 0.047607421875, "learning_rate": 1.343113293568734e-05, "loss": 0.0019, "num_tokens": 90549945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1350 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4503333333333333, "grad_norm": 2.0986277160517375e-09, "kl": 0.039306640625, "learning_rate": 1.342020143325669e-05, "loss": 0.0016, "num_tokens": 90628921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1351 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.45066666666666666, "grad_norm": 3.771317036438404e-09, "kl": 0.04345703125, "learning_rate": 1.3409265300369065e-05, "loss": 0.0017, "num_tokens": 90706633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 
0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1352 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.451, "grad_norm": 2.9408953050591435e-09, "kl": 0.04150390625, "learning_rate": 1.3398324551830416e-05, "loss": 0.0017, "num_tokens": 90784633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1353 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4513333333333333, "grad_norm": 1.4011295457194706e-09, "kl": 0.04315185546875, "learning_rate": 1.3387379202452917e-05, "loss": 0.0017, "num_tokens": 90862345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1354 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.45166666666666666, "grad_norm": 1.1908668495763663e-09, "kl": 0.04931640625, "learning_rate": 1.3376429267054991e-05, "loss": 0.002, "num_tokens": 90938377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1355 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.452, "grad_norm": 2.4293096423377847e-09, "kl": 0.04376220703125, "learning_rate": 1.3365474760461265e-05, "loss": 0.0018, "num_tokens": 91014329.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1356 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4523333333333333, "grad_norm": 2.149812106111426e-09, "kl": 0.048095703125, "learning_rate": 1.3354515697502552e-05, "loss": 0.0019, "num_tokens": 91090825.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1357 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.45266666666666666, "grad_norm": 1.3354113370667164e-09, "kl": 0.04547119140625, "learning_rate": 1.3343552093015833e-05, "loss": 0.0018, "num_tokens": 91165225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1358 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.453, "grad_norm": 1.2926862913431592e-09, "kl": 0.045654296875, "learning_rate": 1.3332583961844243e-05, "loss": 0.0018, "num_tokens": 91239993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1359 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4533333333333333, "grad_norm": 1.5499076466696238e-09, "kl": 0.04150390625, "learning_rate": 1.3321611318837033e-05, "loss": 0.0017, "num_tokens": 91319433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1360 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.45366666666666666, "grad_norm": 2.1318795617730757e-09, "kl": 0.04510498046875, "learning_rate": 1.3310634178849583e-05, "loss": 0.0018, "num_tokens": 91394857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1361 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.454, "grad_norm": 1.2255058079446712e-09, "kl": 0.04852294921875, "learning_rate": 
1.3299652556743341e-05, "loss": 0.0019, "num_tokens": 91470153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1362 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4543333333333333, "grad_norm": 1.1797572918581523e-09, "kl": 0.04364013671875, "learning_rate": 1.3288666467385834e-05, "loss": 0.0017, "num_tokens": 91544697.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1363 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.45466666666666666, "grad_norm": 2.0274464329617103e-09, "kl": 0.0445556640625, "learning_rate": 1.3277675925650635e-05, "loss": 0.0018, "num_tokens": 91621577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1364 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.455, "grad_norm": 1.3492394979053302e-09, "kl": 0.045654296875, "learning_rate": 1.3266680946417346e-05, "loss": 0.0018, "num_tokens": 91696281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1365 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4553333333333333, "grad_norm": 1.99545913126542e-09, "kl": 0.04644775390625, "learning_rate": 1.3255681544571568e-05, "loss": 0.0019, "num_tokens": 91772937.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1366 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.45566666666666666, "grad_norm": 1.376682767784132e-09, "kl": 0.04632568359375, "learning_rate": 1.3244677735004904e-05, "loss": 0.0019, "num_tokens": 91846793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1367 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.456, "grad_norm": 1.3739618331953807e-09, "kl": 0.04718017578125, "learning_rate": 1.3233669532614914e-05, "loss": 0.0019, "num_tokens": 91921753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1368 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4563333333333333, "grad_norm": 1.5161204514058113e-09, "kl": 0.048828125, "learning_rate": 1.3222656952305113e-05, "loss": 0.0019, "num_tokens": 91997033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.45666666666666667, "grad_norm": 1.7136055907585046e-09, "kl": 0.044189453125, "learning_rate": 1.3211640008984934e-05, "loss": 0.0018, "num_tokens": 92072361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1370 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.457, "grad_norm": 1.2126788462296645e-09, "kl": 0.04656982421875, "learning_rate": 1.3200618717569716e-05, "loss": 0.0019, "num_tokens": 92147673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1371 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4573333333333333, "grad_norm": 1.926088177839347e-09, "kl": 0.04302978515625, "learning_rate": 1.3189593092980701e-05, "loss": 0.0017, "num_tokens": 92223433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1372 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.45766666666666667, "grad_norm": 1.3767078588244885e-09, "kl": 0.04730224609375, "learning_rate": 1.317856315014498e-05, "loss": 0.0019, "num_tokens": 92298665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1373 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.458, "grad_norm": 1.9467807366169154e-09, "kl": 0.04718017578125, "learning_rate": 
1.3167528903995497e-05, "loss": 0.0019, "num_tokens": 92373465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1374 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4583333333333333, "grad_norm": 1.3485310645933168e-09, "kl": 0.04571533203125, "learning_rate": 1.3156490369471026e-05, "loss": 0.0018, "num_tokens": 92447449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1375 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.45866666666666667, "grad_norm": 1.0348388812531084e-09, "kl": 0.04608154296875, "learning_rate": 1.3145447561516138e-05, "loss": 0.0018, "num_tokens": 92521497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1376 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.459, "grad_norm": 1.3868521886450935e-09, "kl": 0.0450439453125, "learning_rate": 1.3134400495081197e-05, "loss": 0.0018, "num_tokens": 92595625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1377 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4593333333333333, "grad_norm": 1.4668779524384945e-09, "kl": 0.04315185546875, "learning_rate": 1.3123349185122328e-05, "loss": 0.0017, "num_tokens": 92670505.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1378 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.45966666666666667, "grad_norm": 2.1983377340717425e-09, "kl": 0.0445556640625, "learning_rate": 1.3112293646601402e-05, "loss": 0.0018, "num_tokens": 92747241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1379 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.46, "grad_norm": 2.091724571329223e-09, "kl": 0.048583984375, "learning_rate": 1.3101233894486018e-05, "loss": 0.0019, "num_tokens": 92831817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1380 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4603333333333333, "grad_norm": 2.2024604362513855e-09, "kl": 0.04669189453125, "learning_rate": 1.3090169943749475e-05, "loss": 0.0019, "num_tokens": 92907321.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1381 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.46066666666666667, "grad_norm": 1.9012000862517198e-09, "kl": 0.0462646484375, "learning_rate": 1.307910180937076e-05, "loss": 0.0018, "num_tokens": 92984617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1382 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.461, "grad_norm": 3.703891193751474e-09, "kl": 0.046142578125, "learning_rate": 1.3068029506334526e-05, "loss": 0.0018, "num_tokens": 93060905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1383 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4613333333333333, "grad_norm": 1.9161636721776176e-09, "kl": 0.0465087890625, "learning_rate": 1.3056953049631059e-05, "loss": 0.0019, "num_tokens": 93139593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1384 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.46166666666666667, "grad_norm": 2.7307713867941175e-09, "kl": 0.0450439453125, "learning_rate": 1.3045872454256278e-05, "loss": 0.0018, "num_tokens": 93214057.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1385 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.462, "grad_norm": 2.5718513985140135e-09, "kl": 0.047607421875, "learning_rate": 1.3034787735211708e-05, "loss": 0.0019, 
"num_tokens": 93289721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1386 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4623333333333333, "grad_norm": 9.667121547707325e-10, "kl": 0.04705810546875, "learning_rate": 1.3023698907504447e-05, "loss": 0.0019, "num_tokens": 93366169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1387 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.46266666666666667, "grad_norm": 1.2620564593390782e-09, "kl": 0.044189453125, "learning_rate": 1.301260598614716e-05, "loss": 0.0018, "num_tokens": 93441017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1388 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.463, "grad_norm": 2.6096655947327463e-09, "kl": 0.04632568359375, "learning_rate": 1.3001508986158057e-05, "loss": 0.0019, "num_tokens": 93517273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1389 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4633333333333333, "grad_norm": 1.2768710533350713e-09, "kl": 0.04595947265625, "learning_rate": 1.2990407922560869e-05, "loss": 0.0018, "num_tokens": 93595337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1390 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.46366666666666667, "grad_norm": 2.0450832138863007e-09, "kl": 0.04803466796875, "learning_rate": 1.297930281038482e-05, "loss": 0.0019, "num_tokens": 93671049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1391 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.464, "grad_norm": 2.2590371795416786e-09, "kl": 0.0479736328125, "learning_rate": 1.2968193664664633e-05, "loss": 0.0019, "num_tokens": 93747337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1392 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4643333333333333, "grad_norm": 4.018368748859302e-09, "kl": 0.04583740234375, "learning_rate": 1.2957080500440469e-05, "loss": 0.0018, "num_tokens": 93825273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1393 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4646666666666667, "grad_norm": 1.642464053652759e-09, "kl": 0.0428466796875, "learning_rate": 1.2945963332757949e-05, "loss": 0.0017, "num_tokens": 93901033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1394 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.465, "grad_norm": 2.3243225122371314e-09, "kl": 0.0474853515625, "learning_rate": 1.2934842176668105e-05, "loss": 0.0019, "num_tokens": 93976505.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1395 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4653333333333333, "grad_norm": 1.3201009174679257e-09, "kl": 0.0465087890625, "learning_rate": 1.2923717047227368e-05, "loss": 0.0019, "num_tokens": 94051913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1396 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4656666666666667, "grad_norm": 1.878713185021752e-09, "kl": 0.04595947265625, "learning_rate": 1.2912587959497556e-05, "loss": 0.0018, "num_tokens": 94128025.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1397 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.466, "grad_norm": 1.6010204273442241e-09, "kl": 0.04608154296875, "learning_rate": 1.2901454928545834e-05, "loss": 0.0018, 
"num_tokens": 94204345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1398 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4663333333333333, "grad_norm": 1.8525165845773017e-09, "kl": 0.0452880859375, "learning_rate": 1.2890317969444716e-05, "loss": 0.0018, "num_tokens": 94281161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1399 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4666666666666667, "grad_norm": 1.9988819488503395e-09, "kl": 0.04534912109375, "learning_rate": 1.2879177097272033e-05, "loss": 0.0018, "num_tokens": 94357209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1400 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.467, "grad_norm": 2.9500384357561416e-09, "kl": 0.04437255859375, "learning_rate": 1.2868032327110904e-05, "loss": 0.0018, "num_tokens": 94436153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1401 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4673333333333333, "grad_norm": 1.4637302481190773e-09, "kl": 0.0450439453125, "learning_rate": 1.2856883674049736e-05, "loss": 0.0018, "num_tokens": 94515241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1402 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.4676666666666667, "grad_norm": 2.4397126541231273e-09, "kl": 0.04730224609375, "learning_rate": 1.2845731153182191e-05, "loss": 0.0019, "num_tokens": 94590969.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1403 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.468, "grad_norm": 1.1360166141116679e-09, "kl": 0.0435791015625, "learning_rate": 1.2834574779607163e-05, "loss": 0.0017, "num_tokens": 94669017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1404 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4683333333333333, "grad_norm": 2.3172164187457156e-09, "kl": 0.0452880859375, "learning_rate": 1.2823414568428767e-05, "loss": 0.0018, "num_tokens": 94746665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1405 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4686666666666667, "grad_norm": 1.2102564506122349e-09, "kl": 0.042724609375, "learning_rate": 1.2812250534756307e-05, "loss": 0.0017, "num_tokens": 94821273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1406 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.469, "grad_norm": 2.793179021409742e-09, "kl": 0.048095703125, "learning_rate": 1.2801082693704272e-05, "loss": 0.0019, "num_tokens": 94899001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1407 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4693333333333333, "grad_norm": 1.588487674730743e-09, "kl": 0.04803466796875, "learning_rate": 1.2789911060392295e-05, "loss": 0.0019, "num_tokens": 94973593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1408 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4696666666666667, "grad_norm": 1.8864887429970167e-09, "kl": 0.04559326171875, "learning_rate": 1.277873564994515e-05, "loss": 0.0018, "num_tokens": 95048153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1409 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.47, "grad_norm": 1.6315002682176782e-09, "kl": 0.04736328125, "learning_rate": 1.2767556477492722e-05, "loss": 0.0019, 
"num_tokens": 95121977.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1410 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4703333333333333, "grad_norm": 1.7701125010205487e-09, "kl": 0.047119140625, "learning_rate": 1.2756373558169992e-05, "loss": 0.0019, "num_tokens": 95196649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1411 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4706666666666667, "grad_norm": 1.5310451795258473e-09, "kl": 0.04754638671875, "learning_rate": 1.274518690711701e-05, "loss": 0.0019, "num_tokens": 95272137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1412 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.471, "grad_norm": 1.2528541537548676e-09, "kl": 0.0465087890625, "learning_rate": 1.2733996539478883e-05, "loss": 0.0019, "num_tokens": 95346665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1413 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4713333333333333, "grad_norm": 2.6074351566762743e-09, "kl": 0.04315185546875, "learning_rate": 1.2722802470405744e-05, "loss": 0.0017, "num_tokens": 95427961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1414 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.4716666666666667, "grad_norm": 1.394880433380763e-09, "kl": 0.04388427734375, "learning_rate": 1.271160471505274e-05, "loss": 0.0018, "num_tokens": 95502329.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1415 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.472, "grad_norm": 1.7276177155522987e-09, "kl": 0.04766845703125, "learning_rate": 1.270040328858001e-05, "loss": 0.0019, "num_tokens": 95578633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1416 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4723333333333333, "grad_norm": 1.752081701944519e-09, "kl": 0.04718017578125, "learning_rate": 1.2689198206152657e-05, "loss": 0.0019, "num_tokens": 95653945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1417 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4726666666666667, "grad_norm": 1.3293908196487791e-09, "kl": 0.04864501953125, "learning_rate": 1.2677989482940747e-05, "loss": 0.0019, "num_tokens": 95728505.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1418 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.473, "grad_norm": 1.7600793045247087e-09, "kl": 0.0416259765625, "learning_rate": 1.2666777134119257e-05, "loss": 0.0017, "num_tokens": 95805289.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1419 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.47333333333333333, "grad_norm": 1.7750138026073614e-09, "kl": 0.0465087890625, "learning_rate": 1.265556117486809e-05, "loss": 0.0019, "num_tokens": 95882201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1420 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4736666666666667, "grad_norm": 1.3607460713771502e-09, "kl": 0.046142578125, "learning_rate": 1.2644341620372025e-05, "loss": 0.0018, "num_tokens": 95957113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1421 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.474, "grad_norm": 1.334010124587337e-09, "kl": 0.041748046875, "learning_rate": 
1.2633118485820713e-05, "loss": 0.0017, "num_tokens": 96031465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1422 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.47433333333333333, "grad_norm": 1.2358801759759785e-09, "kl": 0.04290771484375, "learning_rate": 1.2621891786408648e-05, "loss": 0.0017, "num_tokens": 96106201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1423 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4746666666666667, "grad_norm": 1.6416017434295327e-09, "kl": 0.0478515625, "learning_rate": 1.2610661537335163e-05, "loss": 0.0019, "num_tokens": 96182201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1424 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.475, "grad_norm": 2.3159394402227917e-09, "kl": 0.0465087890625, "learning_rate": 1.2599427753804377e-05, "loss": 0.0019, "num_tokens": 96266169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1425 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.47533333333333333, "grad_norm": 1.1112566422610826e-09, "kl": 0.043212890625, "learning_rate": 1.2588190451025209e-05, "loss": 0.0017, "num_tokens": 96339449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1426 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4756666666666667, "grad_norm": 1.889778777908191e-09, "kl": 0.04736328125, "learning_rate": 1.257694964421134e-05, "loss": 0.0019, "num_tokens": 96415865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1427 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.476, "grad_norm": 1.74467429392422e-09, "kl": 0.0526123046875, "learning_rate": 1.256570534858119e-05, "loss": 0.0021, "num_tokens": 96493193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1428 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.47633333333333333, "grad_norm": 2.509080054835522e-09, "kl": 0.04541015625, "learning_rate": 1.2554457579357906e-05, "loss": 0.0018, "num_tokens": 96571673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1429 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4766666666666667, "grad_norm": 1.0961703766909636e-09, "kl": 0.04876708984375, "learning_rate": 1.2543206351769341e-05, "loss": 0.002, "num_tokens": 96646409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1430 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.477, "grad_norm": 1.38895073220624e-09, "kl": 0.04443359375, "learning_rate": 1.253195168104802e-05, "loss": 0.0018, "num_tokens": 96719945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1431 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.47733333333333333, "grad_norm": 1.2896351764268843e-09, "kl": 0.0501708984375, "learning_rate": 1.252069358243114e-05, "loss": 0.002, "num_tokens": 96795097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1432 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4776666666666667, "grad_norm": 2.1438515407368186e-09, "kl": 0.04339599609375, "learning_rate": 1.2509432071160527e-05, "loss": 0.0017, "num_tokens": 96872585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1433 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.478, "grad_norm": 1.4982761697979186e-09, "kl": 0.0465087890625, "learning_rate": 1.2498167162482649e-05, "loss": 0.0019, 
"num_tokens": 96948329.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1434 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.47833333333333333, "grad_norm": 1.269591987096419e-09, "kl": 0.04742431640625, "learning_rate": 1.2486898871648552e-05, "loss": 0.0019, "num_tokens": 97023017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1435 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4786666666666667, "grad_norm": 2.026363521423491e-09, "kl": 0.04449462890625, "learning_rate": 1.2475627213913861e-05, "loss": 0.0018, "num_tokens": 97098281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1436 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.479, "grad_norm": 8.847265697831119e-10, "kl": 0.04180908203125, "learning_rate": 1.246435220453878e-05, "loss": 0.0017, "num_tokens": 97171961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1437 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.47933333333333333, "grad_norm": 1.5877311687617635e-09, "kl": 0.044921875, "learning_rate": 1.2453073858788027e-05, "loss": 0.0018, "num_tokens": 97247433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1438 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.4796666666666667, "grad_norm": 2.2539907718055474e-09, "kl": 0.04840087890625, "learning_rate": 1.2441792191930856e-05, "loss": 0.0019, "num_tokens": 97323609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1439 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.48, "grad_norm": 1.4704915063390445e-09, "kl": 0.044189453125, "learning_rate": 1.2430507219240997e-05, "loss": 0.0018, "num_tokens": 97399513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1440 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.48033333333333333, "grad_norm": 1.8854446892646592e-09, "kl": 0.04400634765625, "learning_rate": 1.2419218955996677e-05, "loss": 0.0018, "num_tokens": 97476793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1441 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4806666666666667, "grad_norm": 2.3348922795207727e-09, "kl": 0.04443359375, "learning_rate": 1.2407927417480567e-05, "loss": 0.0018, "num_tokens": 97555769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1442 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.481, "grad_norm": 1.3160883494123254e-09, "kl": 0.047119140625, "learning_rate": 1.2396632618979772e-05, "loss": 0.0019, "num_tokens": 97629913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1443 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.48133333333333334, "grad_norm": 1.3900840478697774e-09, "kl": 0.04241943359375, "learning_rate": 1.238533457578581e-05, "loss": 0.0017, "num_tokens": 97704793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1444 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4816666666666667, "grad_norm": 2.56626431216489e-09, "kl": 0.0445556640625, "learning_rate": 1.2374033303194597e-05, "loss": 0.0018, "num_tokens": 97778921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1445 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.482, "grad_norm": 1.310583419567024e-09, "kl": 0.04327392578125, "learning_rate": 1.2362728816506418e-05, "loss": 0.0017, 
"num_tokens": 97853273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1446 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.48233333333333334, "grad_norm": 1.58570490071952e-09, "kl": 0.048583984375, "learning_rate": 1.23514211310259e-05, "loss": 0.0019, "num_tokens": 97929577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1447 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4826666666666667, "grad_norm": 2.092530371200496e-09, "kl": 0.0517578125, "learning_rate": 1.2340110262062024e-05, "loss": 0.0021, "num_tokens": 98006553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 
0.0, "rewards/tag_count_reward/std": 0.0, "step": 1448 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.483, "grad_norm": 1.4428417349776623e-09, "kl": 0.044921875, "learning_rate": 1.232879622492806e-05, "loss": 0.0018, "num_tokens": 98081129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1449 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.48333333333333334, "grad_norm": 1.5800878383487316e-09, "kl": 0.04693603515625, "learning_rate": 1.2317479034941572e-05, "loss": 0.0019, "num_tokens": 98159705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1450 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.4836666666666667, "grad_norm": 2.071462779085209e-09, "kl": 0.04217529296875, "learning_rate": 1.2306158707424402e-05, "loss": 0.0017, "num_tokens": 98238425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1451 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.484, "grad_norm": 1.5709021861098904e-09, "kl": 0.04278564453125, "learning_rate": 1.2294835257702629e-05, "loss": 0.0017, "num_tokens": 98315433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1452 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.48433333333333334, "grad_norm": 1.7471090130172229e-09, "kl": 0.04803466796875, "learning_rate": 1.2283508701106559e-05, "loss": 0.0019, "num_tokens": 98390601.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1453 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4846666666666667, "grad_norm": 1.4333209064076868e-09, "kl": 0.04132080078125, "learning_rate": 1.2272179052970711e-05, "loss": 0.0017, "num_tokens": 98464345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1454 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.485, "grad_norm": 2.2627255624740883e-09, "kl": 0.04541015625, "learning_rate": 1.2260846328633786e-05, "loss": 0.0018, "num_tokens": 98541241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1455 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 
0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.48533333333333334, "grad_norm": 1.397131299540888e-09, "kl": 0.041015625, "learning_rate": 1.2249510543438652e-05, "loss": 0.0016, "num_tokens": 98616025.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1456 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4856666666666667, "grad_norm": 2.239487706390264e-09, "kl": 0.0467529296875, "learning_rate": 1.2238171712732316e-05, "loss": 0.0019, "num_tokens": 98692217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1457 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.486, "grad_norm": 1.706297880765817e-09, "kl": 0.04638671875, "learning_rate": 1.2226829851865911e-05, "loss": 0.0019, "num_tokens": 98767865.0, "reward": 0.0, 
"reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1458 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.48633333333333334, "grad_norm": 2.520240238723659e-09, "kl": 0.0458984375, "learning_rate": 1.2215484976194675e-05, "loss": 0.0018, "num_tokens": 98844345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1459 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4866666666666667, "grad_norm": 1.0638442349275579e-09, "kl": 0.04541015625, "learning_rate": 1.2204137101077924e-05, "loss": 0.0018, "num_tokens": 98917753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, 
"step": 1460 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.487, "grad_norm": 1.5510051021294657e-09, "kl": 0.04351806640625, "learning_rate": 1.2192786241879033e-05, "loss": 0.0017, "num_tokens": 98993817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1461 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.48733333333333334, "grad_norm": 1.8757706499172855e-09, "kl": 0.0430908203125, "learning_rate": 1.2181432413965428e-05, "loss": 0.0017, "num_tokens": 99068521.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1462 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4876666666666667, "grad_norm": 
2.1416064477364216e-09, "kl": 0.047607421875, "learning_rate": 1.2170075632708538e-05, "loss": 0.0019, "num_tokens": 99145353.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1463 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.488, "grad_norm": 1.7173015232074818e-09, "kl": 0.0457763671875, "learning_rate": 1.21587159134838e-05, "loss": 0.0018, "num_tokens": 99220361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1464 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.48833333333333334, "grad_norm": 1.9694794683999817e-09, "kl": 0.0418701171875, "learning_rate": 1.2147353271670634e-05, "loss": 0.0017, "num_tokens": 99297625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1465 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4886666666666667, "grad_norm": 1.8061607764963128e-09, "kl": 0.04156494140625, "learning_rate": 1.2135987722652403e-05, "loss": 0.0017, "num_tokens": 99374985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1466 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.489, "grad_norm": 1.2887576561482206e-09, "kl": 0.04608154296875, "learning_rate": 1.2124619281816413e-05, "loss": 0.0018, "num_tokens": 99453369.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1467 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.48933333333333334, "grad_norm": 1.375928815328109e-09, "kl": 0.0478515625, "learning_rate": 1.211324796455389e-05, "loss": 0.0019, "num_tokens": 99528729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1468 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.48966666666666664, "grad_norm": 2.256154374435937e-09, "kl": 0.04486083984375, "learning_rate": 1.210187378625994e-05, "loss": 0.0018, "num_tokens": 99603113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1469 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49, "grad_norm": 1.2393501780394445e-09, "kl": 0.044189453125, "learning_rate": 1.2090496762333565e-05, "loss": 0.0018, "num_tokens": 99675865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 
0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1470 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49033333333333334, "grad_norm": 1.9452728317048695e-09, "kl": 0.0457763671875, "learning_rate": 1.2079116908177592e-05, "loss": 0.0018, "num_tokens": 99752585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1471 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49066666666666664, "grad_norm": 1.5343085690844305e-09, "kl": 0.0455322265625, "learning_rate": 1.2067734239198707e-05, "loss": 0.0018, "num_tokens": 99827161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1472 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.491, "grad_norm": 1.8314056937640544e-09, "kl": 0.04486083984375, "learning_rate": 1.2056348770807386e-05, "loss": 0.0018, "num_tokens": 99903065.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1473 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49133333333333334, "grad_norm": 2.139466603878759e-09, "kl": 0.046875, "learning_rate": 1.2044960518417902e-05, "loss": 0.0019, "num_tokens": 99983321.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1474 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49166666666666664, "grad_norm": 2.123553333177597e-09, "kl": 0.048828125, 
"learning_rate": 1.2033569497448306e-05, "loss": 0.002, "num_tokens": 100064793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1475 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.492, "grad_norm": 2.2304345037582607e-09, "kl": 0.04730224609375, "learning_rate": 1.2022175723320382e-05, "loss": 0.0019, "num_tokens": 100141961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1476 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49233333333333335, "grad_norm": 1.4254857294559997e-09, "kl": 0.04522705078125, "learning_rate": 1.2010779211459649e-05, "loss": 0.0018, "num_tokens": 100217657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1477 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49266666666666664, "grad_norm": 1.55117751976519e-09, "kl": 0.04437255859375, "learning_rate": 1.1999379977295334e-05, "loss": 0.0018, "num_tokens": 100292649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1478 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.493, "grad_norm": 8.613340041208062e-10, "kl": 0.04339599609375, "learning_rate": 1.1987978036260346e-05, "loss": 0.0017, "num_tokens": 100366009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1479 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49333333333333335, "grad_norm": 1.5746163262164714e-09, "kl": 0.0487060546875, "learning_rate": 1.1976573403791263e-05, "loss": 0.0019, "num_tokens": 100441049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1480 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49366666666666664, "grad_norm": 1.8050065886399125e-09, "kl": 0.04571533203125, "learning_rate": 1.1965166095328302e-05, "loss": 0.0018, "num_tokens": 100518985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1481 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.494, "grad_norm": 1.8187547023984507e-09, "kl": 0.0498046875, "learning_rate": 1.1953756126315306e-05, "loss": 0.002, "num_tokens": 100597065.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1482 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49433333333333335, "grad_norm": 1.4223089372933373e-09, "kl": 0.0433349609375, "learning_rate": 1.194234351219972e-05, "loss": 0.0017, "num_tokens": 100671449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1483 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49466666666666664, "grad_norm": 1.969487239961154e-09, "kl": 0.0477294921875, "learning_rate": 1.1930928268432569e-05, "loss": 0.0019, "num_tokens": 100747593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1484 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.495, "grad_norm": 1.3811733978741358e-09, "kl": 0.043701171875, "learning_rate": 1.1919510410468435e-05, "loss": 0.0017, "num_tokens": 100821961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1485 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49533333333333335, "grad_norm": 1.7434232946200723e-09, "kl": 0.04718017578125, "learning_rate": 1.190808995376545e-05, "loss": 0.0019, "num_tokens": 100898217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1486 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49566666666666664, "grad_norm": 1.7419132802842796e-09, "kl": 0.048828125, "learning_rate": 
1.1896666913785248e-05, "loss": 0.002, "num_tokens": 100973625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1487 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.496, "grad_norm": 1.2846314012548987e-09, "kl": 0.04583740234375, "learning_rate": 1.1885241305992976e-05, "loss": 0.0018, "num_tokens": 101050489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1488 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49633333333333335, "grad_norm": 1.8690116121433675e-09, "kl": 0.046142578125, "learning_rate": 1.187381314585725e-05, "loss": 0.0018, "num_tokens": 101129929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1489 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49666666666666665, "grad_norm": 2.1334294331154524e-09, "kl": 0.04766845703125, "learning_rate": 1.1862382448850136e-05, "loss": 0.0019, "num_tokens": 101206057.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1490 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.497, "grad_norm": 1.670320215474419e-09, "kl": 0.048828125, "learning_rate": 1.1850949230447146e-05, "loss": 0.0019, "num_tokens": 101281177.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1491 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.49733333333333335, "grad_norm": 3.2869327259987813e-09, "kl": 0.04791259765625, "learning_rate": 1.1839513506127202e-05, "loss": 0.0019, "num_tokens": 101359113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1492 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49766666666666665, "grad_norm": 9.076445706135416e-10, "kl": 0.04412841796875, "learning_rate": 1.1828075291372616e-05, "loss": 0.0018, "num_tokens": 101437545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1493 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.498, "grad_norm": 8.783641591847413e-10, "kl": 0.04290771484375, "learning_rate": 1.181663460166907e-05, "loss": 0.0017, "num_tokens": 101511961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1494 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49833333333333335, "grad_norm": 1.6328642882257327e-09, "kl": 0.0435791015625, "learning_rate": 1.1805191452505602e-05, "loss": 0.0017, "num_tokens": 101588121.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1495 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49866666666666665, "grad_norm": 1.7981141020584346e-09, "kl": 0.0478515625, "learning_rate": 1.1793745859374575e-05, "loss": 0.0019, "num_tokens": 101663801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1496 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.499, "grad_norm": 1.9772374848514573e-09, "kl": 0.0482177734375, "learning_rate": 1.1782297837771668e-05, "loss": 0.0019, "num_tokens": 101739753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1497 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49933333333333335, "grad_norm": 1.7462925550049135e-09, "kl": 0.04656982421875, "learning_rate": 1.1770847403195836e-05, "loss": 0.0019, "num_tokens": 101813657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1498 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49966666666666665, "grad_norm": 1.8387565914324e-09, "kl": 0.04473876953125, "learning_rate": 
1.175939457114931e-05, "loss": 0.0018, "num_tokens": 101889449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1499 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5, "grad_norm": 1.1449774461880224e-09, "kl": 0.04833984375, "learning_rate": 1.1747939357137568e-05, "loss": 0.0019, "num_tokens": 101964137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1500 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5003333333333333, "grad_norm": 1.7043400024618904e-09, "kl": 0.042724609375, "learning_rate": 1.1736481776669307e-05, "loss": 0.0017, "num_tokens": 102045065.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1501 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5006666666666667, "grad_norm": 1.949060024486471e-09, "kl": 0.04730224609375, "learning_rate": 1.1725021845256426e-05, "loss": 0.0019, "num_tokens": 102120873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1502 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.501, "grad_norm": 1.60807422933118e-09, "kl": 0.0457763671875, "learning_rate": 1.171355957841402e-05, "loss": 0.0018, "num_tokens": 102198473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1503 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.5013333333333333, "grad_norm": 1.6719630124839568e-09, "kl": 0.04345703125, "learning_rate": 1.1702094991660326e-05, "loss": 0.0017, "num_tokens": 102275065.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1504 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5016666666666667, "grad_norm": 1.3148617750147196e-09, "kl": 0.047607421875, "learning_rate": 1.169062810051674e-05, "loss": 0.0019, "num_tokens": 102350153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1505 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.502, "grad_norm": 1.6983747741505795e-09, "kl": 0.04461669921875, "learning_rate": 1.1679158920507773e-05, "loss": 0.0018, "num_tokens": 102424297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1506 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5023333333333333, "grad_norm": 1.5975210043706056e-09, "kl": 0.0452880859375, "learning_rate": 1.1667687467161025e-05, "loss": 0.0018, "num_tokens": 102500105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1507 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5026666666666667, "grad_norm": 1.7908122762477774e-09, "kl": 0.04296875, "learning_rate": 1.1656213756007184e-05, "loss": 0.0017, "num_tokens": 102575481.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1508 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.503, "grad_norm": 1.2947768412985283e-09, "kl": 0.046142578125, "learning_rate": 1.1644737802579989e-05, "loss": 0.0018, "num_tokens": 102650009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1509 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5033333333333333, "grad_norm": 1.1567885538354972e-09, "kl": 0.0452880859375, "learning_rate": 1.1633259622416224e-05, "loss": 0.0018, "num_tokens": 102724025.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1510 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5036666666666667, "grad_norm": 2.3768287338299388e-09, "kl": 0.04443359375, "learning_rate": 
1.1621779231055677e-05, "loss": 0.0018, "num_tokens": 102799273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1511 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.504, "grad_norm": 1.5762814387088042e-09, "kl": 0.04400634765625, "learning_rate": 1.161029664404113e-05, "loss": 0.0018, "num_tokens": 102874105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1512 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5043333333333333, "grad_norm": 2.59026933235873e-09, "kl": 0.04815673828125, "learning_rate": 1.159881187691835e-05, "loss": 0.0019, "num_tokens": 102952185.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1513 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5046666666666667, "grad_norm": 1.1417922163303729e-09, "kl": 0.04241943359375, "learning_rate": 1.158732494523604e-05, "loss": 0.0017, "num_tokens": 103026521.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1514 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.505, "grad_norm": 1.2209561139897573e-09, "kl": 0.04296875, "learning_rate": 1.1575835864545844e-05, "loss": 0.0017, "num_tokens": 103102169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1515 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.5053333333333333, "grad_norm": 2.1902164526466095e-09, "kl": 0.04669189453125, "learning_rate": 1.156434465040231e-05, "loss": 0.0019, "num_tokens": 103177225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1516 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5056666666666667, "grad_norm": 1.4495937783465251e-09, "kl": 0.0479736328125, "learning_rate": 1.1552851318362876e-05, "loss": 0.0019, "num_tokens": 103252441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1517 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.506, "grad_norm": 1.3732713854963663e-09, "kl": 0.04290771484375, "learning_rate": 1.154135588398785e-05, "loss": 0.0017, "num_tokens": 103328313.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1518 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5063333333333333, "grad_norm": 1.4534400349930365e-09, "kl": 0.04730224609375, "learning_rate": 1.1529858362840383e-05, "loss": 0.0019, "num_tokens": 103403433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1519 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5066666666666667, "grad_norm": 9.694033353824238e-10, "kl": 0.0457763671875, "learning_rate": 1.151835877048645e-05, "loss": 0.0018, "num_tokens": 103477401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1520 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.507, "grad_norm": 1.4473785503454906e-09, "kl": 0.04449462890625, "learning_rate": 1.1506857122494832e-05, "loss": 0.0018, "num_tokens": 103557209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1521 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5073333333333333, "grad_norm": 1.565738094733149e-09, "kl": 0.04644775390625, "learning_rate": 1.1495353434437098e-05, "loss": 0.0019, "num_tokens": 103631513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1522 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5076666666666667, "grad_norm": 1.6531110924589143e-09, "kl": 0.0491943359375, "learning_rate": 
1.1483847721887567e-05, "loss": 0.002, "num_tokens": 103706553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1523 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.508, "grad_norm": 1.7490170423073437e-09, "kl": 0.04669189453125, "learning_rate": 1.1472340000423313e-05, "loss": 0.0019, "num_tokens": 103779753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1524 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5083333333333333, "grad_norm": 1.7742116664720697e-09, "kl": 0.0445556640625, "learning_rate": 1.1460830285624119e-05, "loss": 0.0018, "num_tokens": 103859417.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 
0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1525 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5086666666666667, "grad_norm": 1.2627474621496049e-09, "kl": 0.0455322265625, "learning_rate": 1.1449318593072468e-05, "loss": 0.0018, "num_tokens": 103937177.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1526 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.509, "grad_norm": 9.279628176983579e-10, "kl": 0.04559326171875, "learning_rate": 1.143780493835353e-05, "loss": 0.0018, "num_tokens": 104010777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1527 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.5093333333333333, "grad_norm": 1.976980801288164e-09, "kl": 0.04974365234375, "learning_rate": 1.1426289337055119e-05, "loss": 0.002, "num_tokens": 104087145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1528 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5096666666666667, "grad_norm": 1.201163279951345e-09, "kl": 0.04742431640625, "learning_rate": 1.141477180476769e-05, "loss": 0.0019, "num_tokens": 104162793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1529 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.51, "grad_norm": 2.458331094246091e-09, "kl": 0.0447998046875, "learning_rate": 1.1403252357084315e-05, "loss": 0.0018, "num_tokens": 104241001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1530 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5103333333333333, "grad_norm": 1.587015741044695e-09, "kl": 0.04559326171875, "learning_rate": 1.1391731009600655e-05, "loss": 0.0018, "num_tokens": 104316841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1531 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5106666666666667, "grad_norm": 1.4630648914604194e-09, "kl": 0.042724609375, "learning_rate": 1.1380207777914946e-05, "loss": 0.0017, "num_tokens": 104392121.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1532 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.511, "grad_norm": 1.488992262821398e-09, "kl": 0.04449462890625, "learning_rate": 1.1368682677627971e-05, "loss": 0.0018, "num_tokens": 104466873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1533 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5113333333333333, "grad_norm": 1.585642728230141e-09, "kl": 0.04608154296875, "learning_rate": 1.1357155724343046e-05, "loss": 0.0018, "num_tokens": 104540873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1534 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5116666666666667, "grad_norm": 1.531700433154981e-09, "kl": 0.04547119140625, "learning_rate": 
1.1345626933665996e-05, "loss": 0.0018, "num_tokens": 104614393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1535 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.512, "grad_norm": 1.9156807251619057e-09, "kl": 0.05059814453125, "learning_rate": 1.1334096321205129e-05, "loss": 0.002, "num_tokens": 104690889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1536 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5123333333333333, "grad_norm": 3.2628990620509057e-09, "kl": 0.044921875, "learning_rate": 1.1322563902571227e-05, "loss": 0.0018, "num_tokens": 104767577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1537 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5126666666666667, "grad_norm": 1.294423901399e-09, "kl": 0.04449462890625, "learning_rate": 1.1311029693377511e-05, "loss": 0.0018, "num_tokens": 104841161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1538 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.513, "grad_norm": 2.1832582408620738e-09, "kl": 0.043212890625, "learning_rate": 1.1299493709239628e-05, "loss": 0.0017, "num_tokens": 104918233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1539 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.5133333333333333, "grad_norm": 2.7833639837382407e-09, "kl": 0.0458984375, "learning_rate": 1.128795596577563e-05, "loss": 0.0018, "num_tokens": 104994233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1540 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5136666666666667, "grad_norm": 1.2344314459511452e-09, "kl": 0.04583740234375, "learning_rate": 1.127641647860595e-05, "loss": 0.0018, "num_tokens": 105068297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1541 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.514, "grad_norm": 2.1082129375571412e-09, "kl": 0.04412841796875, "learning_rate": 1.1264875263353375e-05, "loss": 0.0018, "num_tokens": 105143865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1542 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5143333333333333, "grad_norm": 2.350319716626359e-09, "kl": 0.04864501953125, "learning_rate": 1.1253332335643043e-05, "loss": 0.0019, "num_tokens": 105219481.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1543 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5146666666666667, "grad_norm": 1.685978023857615e-09, "kl": 0.0474853515625, "learning_rate": 1.1241787711102405e-05, "loss": 0.0019, "num_tokens": 105294489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1544 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.515, "grad_norm": 1.740410704442752e-09, "kl": 0.042724609375, "learning_rate": 1.1230241405361209e-05, "loss": 0.0017, "num_tokens": 105369945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1545 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5153333333333333, "grad_norm": 2.745652816216193e-09, "kl": 0.04925537109375, "learning_rate": 1.1218693434051475e-05, "loss": 0.002, "num_tokens": 105445465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1546 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5156666666666667, "grad_norm": 2.3614223909618204e-09, "kl": 0.0479736328125, "learning_rate": 
1.1207143812807489e-05, "loss": 0.0019, "num_tokens": 105523945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1547 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.516, "grad_norm": 2.0211039508666317e-09, "kl": 0.049072265625, "learning_rate": 1.1195592557265757e-05, "loss": 0.002, "num_tokens": 105598361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1548 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5163333333333333, "grad_norm": 1.0911613834707623e-09, "kl": 0.044921875, "learning_rate": 1.1184039683065014e-05, "loss": 0.0018, "num_tokens": 105673401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1549 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5166666666666667, "grad_norm": 1.3093715001133432e-09, "kl": 0.0445556640625, "learning_rate": 1.1172485205846161e-05, "loss": 0.0018, "num_tokens": 105746281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1550 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.517, "grad_norm": 1.5854247914504072e-09, "kl": 0.04217529296875, "learning_rate": 1.1160929141252303e-05, "loss": 0.0017, "num_tokens": 105820873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1551 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.5173333333333333, "grad_norm": 1.7735748425451447e-09, "kl": 0.0447998046875, "learning_rate": 1.1149371504928667e-05, "loss": 0.0018, "num_tokens": 105898697.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1552 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5176666666666667, "grad_norm": 1.7111051464624438e-09, "kl": 0.0458984375, "learning_rate": 1.1137812312522618e-05, "loss": 0.0018, "num_tokens": 105975001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1553 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.518, "grad_norm": 1.2639812529968708e-09, "kl": 0.04449462890625, "learning_rate": 1.112625157968363e-05, "loss": 0.0018, "num_tokens": 106050265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1554 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5183333333333333, "grad_norm": 1.3364678252969497e-09, "kl": 0.04571533203125, "learning_rate": 1.1114689322063255e-05, "loss": 0.0018, "num_tokens": 106125721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1555 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5186666666666667, "grad_norm": 2.1918862280756457e-09, "kl": 0.047607421875, "learning_rate": 1.110312555531512e-05, "loss": 0.0019, "num_tokens": 106202105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1556 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.519, "grad_norm": 2.3662052317519056e-09, "kl": 0.0491943359375, "learning_rate": 1.109156029509488e-05, "loss": 0.002, "num_tokens": 106283257.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1557 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5193333333333333, "grad_norm": 1.356117107498278e-09, "kl": 0.0482177734375, "learning_rate": 1.1079993557060228e-05, "loss": 0.0019, "num_tokens": 106356857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1558 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5196666666666667, "grad_norm": 1.7797547879894182e-09, "kl": 0.044921875, "learning_rate": 
1.1068425356870853e-05, "loss": 0.0018, "num_tokens": 106430921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1559 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.52, "grad_norm": 6.506926553129233e-09, "kl": 0.05059814453125, "learning_rate": 1.1056855710188413e-05, "loss": 0.002, "num_tokens": 106514425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1560 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5203333333333333, "grad_norm": 2.244831209807785e-09, "kl": 0.046142578125, "learning_rate": 1.1045284632676535e-05, "loss": 0.0018, "num_tokens": 106590777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1561 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5206666666666667, "grad_norm": 2.2646711173024414e-09, "kl": 0.04278564453125, "learning_rate": 1.1033712140000787e-05, "loss": 0.0017, "num_tokens": 106667401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1562 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.521, "grad_norm": 1.2767026325022357e-09, "kl": 0.04571533203125, "learning_rate": 1.1022138247828638e-05, "loss": 0.0018, "num_tokens": 106742041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1563 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.5213333333333333, "grad_norm": 1.7591560430574305e-09, "kl": 0.0428466796875, "learning_rate": 1.1010562971829464e-05, "loss": 0.0017, "num_tokens": 106818857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1564 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5216666666666666, "grad_norm": 1.4238425993795545e-09, "kl": 0.04779052734375, "learning_rate": 1.0998986327674515e-05, "loss": 0.0019, "num_tokens": 106893625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1565 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.522, "grad_norm": 1.7487855608067093e-09, "kl": 0.04510498046875, "learning_rate": 1.0987408331036879e-05, "loss": 0.0018, "num_tokens": 106971113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1566 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5223333333333333, "grad_norm": 1.294450546751591e-09, "kl": 0.04144287109375, "learning_rate": 1.0975828997591496e-05, "loss": 0.0017, "num_tokens": 107043545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1567 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5226666666666666, "grad_norm": 2.2019452927679595e-09, "kl": 0.04510498046875, "learning_rate": 1.09642483430151e-05, "loss": 0.0018, "num_tokens": 107117369.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1568 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.523, "grad_norm": 1.363847812463348e-09, "kl": 0.047119140625, "learning_rate": 1.0952666382986216e-05, "loss": 0.0019, "num_tokens": 107194345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1569 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5233333333333333, "grad_norm": 2.162380052794788e-09, "kl": 0.0447998046875, "learning_rate": 1.0941083133185146e-05, "loss": 0.0018, "num_tokens": 107272953.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1570 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5236666666666666, "grad_norm": 1.596255350122533e-09, "kl": 0.046875, "learning_rate": 1.0929498609293925e-05, 
"loss": 0.0019, "num_tokens": 107347769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1571 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.524, "grad_norm": 1.5163403865869896e-09, "kl": 0.0450439453125, "learning_rate": 1.0917912826996319e-05, "loss": 0.0018, "num_tokens": 107423145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1572 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5243333333333333, "grad_norm": 1.702663454672404e-09, "kl": 0.0465087890625, "learning_rate": 1.0906325801977804e-05, "loss": 0.0019, "num_tokens": 107499449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1573 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5246666666666666, "grad_norm": 1.6652705880915164e-09, "kl": 0.04718017578125, "learning_rate": 1.0894737549925525e-05, "loss": 0.0019, "num_tokens": 107576729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1574 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.525, "grad_norm": 1.6206673780772007e-09, "kl": 0.044189453125, "learning_rate": 1.08831480865283e-05, "loss": 0.0018, "num_tokens": 107651801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1575 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.5253333333333333, "grad_norm": 2.1230595059762436e-09, "kl": 0.04888916015625, "learning_rate": 1.0871557427476585e-05, "loss": 0.002, "num_tokens": 107728089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1576 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5256666666666666, "grad_norm": 3.244532420509927e-09, "kl": 0.04852294921875, "learning_rate": 1.0859965588462442e-05, "loss": 0.0019, "num_tokens": 107806073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1577 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.526, "grad_norm": 1.3243509622284932e-09, "kl": 0.04803466796875, "learning_rate": 1.0848372585179552e-05, "loss": 0.0019, "num_tokens": 107885241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1578 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5263333333333333, "grad_norm": 2.8321101019912476e-09, "kl": 0.04473876953125, "learning_rate": 1.083677843332316e-05, "loss": 0.0018, "num_tokens": 107961081.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1579 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5266666666666666, "grad_norm": 1.0748488765699449e-09, "kl": 0.04559326171875, "learning_rate": 1.0825183148590055e-05, "loss": 0.0018, "num_tokens": 108035849.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1580 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.527, "grad_norm": 1.0800532690424802e-09, "kl": 0.0411376953125, "learning_rate": 1.0813586746678584e-05, "loss": 0.0016, "num_tokens": 108113433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1581 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5273333333333333, "grad_norm": 2.0653483367993886e-09, "kl": 0.04840087890625, "learning_rate": 1.0801989243288588e-05, "loss": 0.0019, "num_tokens": 108190537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1582 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5276666666666666, "grad_norm": 2.1389061632959283e-09, "kl": 0.04339599609375, "learning_rate": 
1.0790390654121414e-05, "loss": 0.0017, "num_tokens": 108266617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1583 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.528, "grad_norm": 1.8402083190593999e-09, "kl": 0.0478515625, "learning_rate": 1.077879099487986e-05, "loss": 0.0019, "num_tokens": 108343481.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1584 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5283333333333333, "grad_norm": 2.042461533235951e-09, "kl": 0.0423583984375, "learning_rate": 1.0767190281268187e-05, "loss": 0.0017, "num_tokens": 108419705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1585 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5286666666666666, "grad_norm": 1.421043394067567e-09, "kl": 0.04241943359375, "learning_rate": 1.0755588528992082e-05, "loss": 0.0017, "num_tokens": 108498633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1586 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.529, "grad_norm": 2.2432247170911523e-09, "kl": 0.04119873046875, "learning_rate": 1.0743985753758636e-05, "loss": 0.0016, "num_tokens": 108579465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1587 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.5293333333333333, "grad_norm": 1.591195064598594e-09, "kl": 0.04156494140625, "learning_rate": 1.0732381971276318e-05, "loss": 0.0017, "num_tokens": 108655993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1588 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5296666666666666, "grad_norm": 1.6709698069661272e-09, "kl": 0.04559326171875, "learning_rate": 1.0720777197254974e-05, "loss": 0.0018, "num_tokens": 108730041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1589 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.53, "grad_norm": 1.7509981242724848e-09, "kl": 0.04241943359375, "learning_rate": 1.0709171447405786e-05, "loss": 0.0017, "num_tokens": 108809193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1590 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5303333333333333, "grad_norm": 1.7802064267158357e-09, "kl": 0.04638671875, "learning_rate": 1.0697564737441254e-05, "loss": 0.0019, "num_tokens": 108885529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1591 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5306666666666666, "grad_norm": 1.1593932480735702e-09, "kl": 0.0460205078125, "learning_rate": 1.0685957083075182e-05, "loss": 0.0018, "num_tokens": 108958921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1592 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.531, "grad_norm": 1.6403380875829043e-09, "kl": 0.046142578125, "learning_rate": 1.0674348500022653e-05, "loss": 0.0018, "num_tokens": 109033721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1593 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5313333333333333, "grad_norm": 3.140320004035857e-09, "kl": 0.0452880859375, "learning_rate": 1.0662739004000005e-05, "loss": 0.0018, "num_tokens": 109109273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1594 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5316666666666666, "grad_norm": 2.002084942276383e-09, "kl": 0.0445556640625, "learning_rate": 
1.0651128610724808e-05, "loss": 0.0018, "num_tokens": 109188265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1595 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.532, "grad_norm": 1.3694785305773394e-09, "kl": 0.0467529296875, "learning_rate": 1.0639517335915857e-05, "loss": 0.0019, "num_tokens": 109263497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1596 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5323333333333333, "grad_norm": 2.0904051822867586e-09, "kl": 0.04437255859375, "learning_rate": 1.0627905195293135e-05, "loss": 0.0018, "num_tokens": 109340073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 
0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1597 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5326666666666666, "grad_norm": 1.9008772333961588e-09, "kl": 0.04156494140625, "learning_rate": 1.0616292204577796e-05, "loss": 0.0017, "num_tokens": 109422873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1598 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.533, "grad_norm": 1.6173647976458483e-09, "kl": 0.0447998046875, "learning_rate": 1.0604678379492143e-05, "loss": 0.0018, "num_tokens": 109497785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1599 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.5333333333333333, "grad_norm": 1.5989894963652773e-09, "kl": 0.0478515625, "learning_rate": 1.0593063735759619e-05, "loss": 0.0019, "num_tokens": 109571801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1600 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5336666666666666, "grad_norm": 2.0446220272418714e-09, "kl": 0.0411376953125, "learning_rate": 1.0581448289104759e-05, "loss": 0.0016, "num_tokens": 109648329.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1601 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.534, "grad_norm": 2.5829700600610295e-09, "kl": 0.042724609375, "learning_rate": 1.05698320552532e-05, "loss": 0.0017, "num_tokens": 109726841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1602 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5343333333333333, "grad_norm": 2.544499722034743e-09, "kl": 0.0421142578125, "learning_rate": 1.055821504993164e-05, "loss": 0.0017, "num_tokens": 109806873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1603 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5346666666666666, "grad_norm": 1.5378008866306914e-09, "kl": 0.0489501953125, "learning_rate": 1.0546597288867815e-05, "loss": 0.002, "num_tokens": 109881977.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1604 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.535, "grad_norm": 2.2183384018603647e-09, "kl": 0.04217529296875, "learning_rate": 1.0534978787790494e-05, "loss": 0.0017, "num_tokens": 109962073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1605 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5353333333333333, "grad_norm": 1.3582489577501633e-09, "kl": 0.04730224609375, "learning_rate": 1.0523359562429441e-05, "loss": 0.0019, "num_tokens": 110035881.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1606 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5356666666666666, "grad_norm": 1.8569572546311974e-09, "kl": 0.0443115234375, "learning_rate": 
1.0511739628515402e-05, "loss": 0.0018, "num_tokens": 110112265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1607 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.536, "grad_norm": 1.3517668095985869e-09, "kl": 0.04412841796875, "learning_rate": 1.0500119001780085e-05, "loss": 0.0018, "num_tokens": 110187321.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1608 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5363333333333333, "grad_norm": 1.730772858365981e-09, "kl": 0.0443115234375, "learning_rate": 1.0488497697956134e-05, "loss": 0.0018, "num_tokens": 110263097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 
0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1609 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5366666666666666, "grad_norm": 1.7241058580808044e-09, "kl": 0.0472412109375, "learning_rate": 1.047687573277711e-05, "loss": 0.0019, "num_tokens": 110339033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1610 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.537, "grad_norm": 1.2435815710531983e-09, "kl": 0.048583984375, "learning_rate": 1.046525312197747e-05, "loss": 0.0019, "num_tokens": 110413353.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1611 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.5373333333333333, "grad_norm": 2.284982203448749e-09, "kl": 0.046142578125, "learning_rate": 1.0453629881292537e-05, "loss": 0.0018, "num_tokens": 110491769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1612 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5376666666666666, "grad_norm": 2.6038804445960295e-09, "kl": 0.0460205078125, "learning_rate": 1.0442006026458506e-05, "loss": 0.0018, "num_tokens": 110566393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1613 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.538, "grad_norm": 1.7037118382745575e-09, "kl": 0.04541015625, "learning_rate": 1.0430381573212385e-05, "loss": 0.0018, "num_tokens": 110642505.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1614 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5383333333333333, "grad_norm": 3.1982430037658105e-09, "kl": 0.04730224609375, "learning_rate": 1.0418756537291996e-05, "loss": 0.0019, "num_tokens": 110719049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1615 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5386666666666666, "grad_norm": 1.6670455016409846e-09, "kl": 0.0474853515625, "learning_rate": 1.040713093443596e-05, "loss": 0.0019, "num_tokens": 110793641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1616 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.539, "grad_norm": 1.5777880113532206e-09, "kl": 0.0450439453125, "learning_rate": 1.0395504780383653e-05, "loss": 0.0018, "num_tokens": 110868873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1617 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5393333333333333, "grad_norm": 1.6516649159470376e-09, "kl": 0.044677734375, "learning_rate": 1.03838780908752e-05, "loss": 0.0018, "num_tokens": 110941993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1618 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5396666666666666, "grad_norm": 1.9598369593865073e-09, "kl": 0.046630859375, "learning_rate": 
1.037225088165146e-05, "loss": 0.0019, "num_tokens": 111018313.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1619 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.54, "grad_norm": 1.3101654205982527e-09, "kl": 0.04534912109375, "learning_rate": 1.0360623168453982e-05, "loss": 0.0018, "num_tokens": 111094585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1620 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5403333333333333, "grad_norm": 1.16604326194647e-09, "kl": 0.04608154296875, "learning_rate": 1.0348994967025012e-05, "loss": 0.0018, "num_tokens": 111169129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1621 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5406666666666666, "grad_norm": 1.2935215121245847e-09, "kl": 0.046875, "learning_rate": 1.0337366293107441e-05, "loss": 0.0019, "num_tokens": 111244009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1622 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.541, "grad_norm": 1.346747824371164e-09, "kl": 0.0489501953125, "learning_rate": 1.0325737162444813e-05, "loss": 0.002, "num_tokens": 111319961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1623 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.5413333333333333, "grad_norm": 1.6689208903741815e-09, "kl": 0.04791259765625, "learning_rate": 1.0314107590781284e-05, "loss": 0.0019, "num_tokens": 111396745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1624 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5416666666666666, "grad_norm": 1.3106982166277703e-09, "kl": 0.04718017578125, "learning_rate": 1.0302477593861608e-05, "loss": 0.0019, "num_tokens": 111470713.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1625 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.542, "grad_norm": 1.0043521569969016e-09, "kl": 0.04290771484375, "learning_rate": 1.0290847187431115e-05, "loss": 0.0017, "num_tokens": 111543417.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1626 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5423333333333333, "grad_norm": 1.5507116701840573e-09, "kl": 0.04278564453125, "learning_rate": 1.0279216387235691e-05, "loss": 0.0017, "num_tokens": 111619385.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1627 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5426666666666666, "grad_norm": 1.8860577544188573e-09, "kl": 0.04736328125, "learning_rate": 1.0267585209021748e-05, "loss": 0.0019, "num_tokens": 111697401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1628 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.543, "grad_norm": 1.6815474568332434e-09, "kl": 0.04559326171875, "learning_rate": 1.0255953668536223e-05, "loss": 0.0018, "num_tokens": 111773385.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1629 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5433333333333333, "grad_norm": 2.135041476947208e-09, "kl": 0.0477294921875, "learning_rate": 1.0244321781526533e-05, "loss": 0.0019, "num_tokens": 111849833.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1630 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5436666666666666, "grad_norm": 1.212184019827589e-09, "kl": 0.045654296875, "learning_rate": 
1.0232689563740563e-05, "loss": 0.0018, "num_tokens": 111924185.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1631 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.544, "grad_norm": 1.1712861791579599e-09, "kl": 0.046630859375, "learning_rate": 1.0221057030926657e-05, "loss": 0.0019, "num_tokens": 111999625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1632 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5443333333333333, "grad_norm": 1.0637560832194026e-09, "kl": 0.0472412109375, "learning_rate": 1.0209424198833571e-05, "loss": 0.0019, "num_tokens": 112073305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1633 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5446666666666666, "grad_norm": 1.7328191104226676e-09, "kl": 0.0465087890625, "learning_rate": 1.0197791083210478e-05, "loss": 0.0019, "num_tokens": 112150649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1634 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.545, "grad_norm": 1.9609580625967737e-09, "kl": 0.040557861328125, "learning_rate": 1.0186157699806928e-05, "loss": 0.0016, "num_tokens": 112226681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1635 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.5453333333333333, "grad_norm": 1.1781242648112311e-09, "kl": 0.04388427734375, "learning_rate": 1.0174524064372837e-05, "loss": 0.0018, "num_tokens": 112300889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1636 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5456666666666666, "grad_norm": 1.7359098603009215e-09, "kl": 0.04547119140625, "learning_rate": 1.0162890192658459e-05, "loss": 0.0018, "num_tokens": 112376905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1637 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.546, "grad_norm": 3.3030778112674852e-09, "kl": 0.0445556640625, "learning_rate": 1.0151256100414375e-05, "loss": 0.0018, "num_tokens": 112454249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1638 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5463333333333333, "grad_norm": 2.270676535687244e-09, "kl": 0.043212890625, "learning_rate": 1.0139621803391454e-05, "loss": 0.0017, "num_tokens": 112529961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1639 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5466666666666666, "grad_norm": 2.0575796622068765e-09, "kl": 0.04351806640625, "learning_rate": 1.0127987317340851e-05, "loss": 0.0017, "num_tokens": 112608361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1640 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.547, "grad_norm": 1.5570634781525428e-09, "kl": 0.044189453125, "learning_rate": 1.0116352658013973e-05, "loss": 0.0018, "num_tokens": 112681593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1641 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5473333333333333, "grad_norm": 1.2765823953486688e-09, "kl": 0.0439453125, "learning_rate": 1.010471784116246e-05, "loss": 0.0018, "num_tokens": 112758265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1642 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5476666666666666, "grad_norm": 1.8057595418952133e-09, "kl": 0.047607421875, "learning_rate": 
1.009308288253817e-05, "loss": 0.0019, "num_tokens": 112833545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1643 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.548, "grad_norm": 2.8282689523706495e-09, "kl": 0.0478515625, "learning_rate": 1.0081447797893149e-05, "loss": 0.0019, "num_tokens": 112909401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1644 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5483333333333333, "grad_norm": 2.4261828102112304e-09, "kl": 0.04705810546875, "learning_rate": 1.0069812602979617e-05, "loss": 0.0019, "num_tokens": 112987273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1645 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5486666666666666, "grad_norm": 1.696069062973038e-09, "kl": 0.04608154296875, "learning_rate": 1.005817731354994e-05, "loss": 0.0018, "num_tokens": 113061801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1646 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.549, "grad_norm": 2.684956257326121e-09, "kl": 0.0438232421875, "learning_rate": 1.0046541945356613e-05, "loss": 0.0018, "num_tokens": 113140345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1647 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.5493333333333333, "grad_norm": 1.3716794367013563e-09, "kl": 0.04547119140625, "learning_rate": 1.0034906514152239e-05, "loss": 0.0018, "num_tokens": 113216153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1648 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5496666666666666, "grad_norm": 1.4945731319215838e-09, "kl": 0.0452880859375, "learning_rate": 1.0023271035689506e-05, "loss": 0.0018, "num_tokens": 113291289.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1649 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.55, "grad_norm": 1.4617590471388553e-09, "kl": 0.04071044921875, "learning_rate": 1.001163552572116e-05, "loss": 0.0016, "num_tokens": 113367305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1650 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5503333333333333, "grad_norm": 2.175035263007885e-09, "kl": 0.042724609375, "learning_rate": 1e-05, "loss": 0.0017, "num_tokens": 113443529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1651 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5506666666666666, "grad_norm": 2.008985866552848e-09, "kl": 0.04693603515625, "learning_rate": 9.988364474278844e-06, "loss": 0.0019, "num_tokens": 113519529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1652 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.551, "grad_norm": 1.3901484408052056e-09, "kl": 0.04925537109375, "learning_rate": 9.976728964310499e-06, "loss": 0.002, "num_tokens": 113595225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1653 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5513333333333333, "grad_norm": 2.2641430952319297e-09, "kl": 0.0450439453125, "learning_rate": 9.965093485847766e-06, "loss": 0.0018, "num_tokens": 113670329.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1654 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5516666666666666, "grad_norm": 1.9120292016339135e-09, "kl": 0.0457763671875, "learning_rate": 9.953458054643389e-06, "loss": 0.0018, 
"num_tokens": 113747929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1655 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.552, "grad_norm": 1.233242730158679e-09, "kl": 0.0477294921875, "learning_rate": 9.941822686450061e-06, "loss": 0.0019, "num_tokens": 113820761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1656 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5523333333333333, "grad_norm": 3.649298863095396e-09, "kl": 0.048583984375, "learning_rate": 9.930187397020385e-06, "loss": 0.0019, "num_tokens": 113895401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 1657 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5526666666666666, "grad_norm": 1.4731783570809398e-09, "kl": 0.04388427734375, "learning_rate": 9.918552202106853e-06, "loss": 0.0018, "num_tokens": 113969929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1658 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.553, "grad_norm": 1.9787802507664765e-09, "kl": 0.0460205078125, "learning_rate": 9.906917117461835e-06, "loss": 0.0018, "num_tokens": 114045129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1659 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.5533333333333333, "grad_norm": 2.3373771806944887e-09, "kl": 0.0440673828125, "learning_rate": 9.895282158837545e-06, "loss": 0.0018, "num_tokens": 114121625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1660 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5536666666666666, "grad_norm": 3.3639102614557714e-09, "kl": 0.0445556640625, "learning_rate": 9.883647341986032e-06, "loss": 0.0018, "num_tokens": 114198777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1661 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.554, "grad_norm": 1.687276873774124e-09, "kl": 0.04205322265625, "learning_rate": 9.87201268265915e-06, "loss": 0.0017, "num_tokens": 114275129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1662 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5543333333333333, "grad_norm": 1.5919796592100965e-09, "kl": 0.04339599609375, "learning_rate": 9.860378196608549e-06, "loss": 0.0017, "num_tokens": 114351721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1663 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5546666666666666, "grad_norm": 1.410501271337239e-09, "kl": 0.04559326171875, "learning_rate": 9.848743899585628e-06, "loss": 0.0018, "num_tokens": 114424537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1664 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.555, "grad_norm": 2.9710793825188375e-09, "kl": 0.04156494140625, "learning_rate": 9.837109807341543e-06, "loss": 0.0017, "num_tokens": 114505321.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1665 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5553333333333333, "grad_norm": 1.39505929031003e-09, "kl": 0.049072265625, "learning_rate": 9.825475935627165e-06, "loss": 0.002, "num_tokens": 114581977.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1666 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5556666666666666, "grad_norm": 1.3482273075737794e-09, "kl": 0.04498291015625, "learning_rate": 9.813842300193077e-06, "loss": 0.0018, 
"num_tokens": 114656361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1667 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.556, "grad_norm": 1.4014948090945722e-09, "kl": 0.04058837890625, "learning_rate": 9.802208916789528e-06, "loss": 0.0016, "num_tokens": 114732121.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1668 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5563333333333333, "grad_norm": 1.881304223516622e-09, "kl": 0.04522705078125, "learning_rate": 9.790575801166432e-06, "loss": 0.0018, "num_tokens": 114806873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 1669 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5566666666666666, "grad_norm": 1.7991323986166208e-09, "kl": 0.0462646484375, "learning_rate": 9.778942969073345e-06, "loss": 0.0019, "num_tokens": 114882825.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1670 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.557, "grad_norm": 2.1693273843936822e-09, "kl": 0.04815673828125, "learning_rate": 9.767310436259438e-06, "loss": 0.0019, "num_tokens": 114958089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1671 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.5573333333333333, "grad_norm": 2.684920952233938e-09, "kl": 0.04547119140625, "learning_rate": 9.75567821847347e-06, "loss": 0.0018, "num_tokens": 115035321.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1672 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5576666666666666, "grad_norm": 1.7691628162452844e-09, "kl": 0.04571533203125, "learning_rate": 9.74404633146378e-06, "loss": 0.0018, "num_tokens": 115111257.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1673 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.558, "grad_norm": 1.706522811950606e-09, "kl": 0.04486083984375, "learning_rate": 9.732414790978253e-06, "loss": 0.0018, "num_tokens": 115186505.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5583333333333333, "grad_norm": 1.8742893903578306e-09, "kl": 0.04315185546875, "learning_rate": 9.720783612764314e-06, "loss": 0.0017, "num_tokens": 115264025.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1675 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5586666666666666, "grad_norm": 1.583735365073835e-09, "kl": 0.04473876953125, "learning_rate": 9.709152812568886e-06, "loss": 0.0018, "num_tokens": 115342953.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1676 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.559, "grad_norm": 2.468940385469409e-09, "kl": 0.04864501953125, "learning_rate": 9.697522406138395e-06, "loss": 0.0019, "num_tokens": 115418441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1677 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5593333333333333, "grad_norm": 2.229818107934989e-09, "kl": 0.0491943359375, "learning_rate": 9.685892409218718e-06, "loss": 0.002, "num_tokens": 115495177.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1678 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5596666666666666, "grad_norm": 1.401665672418062e-09, "kl": 0.04278564453125, "learning_rate": 9.67426283755519e-06, "loss": 0.0017, 
"num_tokens": 115569433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1679 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.56, "grad_norm": 2.023365475167793e-09, "kl": 0.0465087890625, "learning_rate": 9.66263370689256e-06, "loss": 0.0019, "num_tokens": 115646969.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1680 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5603333333333333, "grad_norm": 1.4352715682619532e-09, "kl": 0.0433349609375, "learning_rate": 9.651005032974994e-06, "loss": 0.0017, "num_tokens": 115722665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 1681 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5606666666666666, "grad_norm": 1.5592030999656004e-09, "kl": 0.0430908203125, "learning_rate": 9.639376831546018e-06, "loss": 0.0017, "num_tokens": 115798297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1682 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.561, "grad_norm": 2.4742110582565147e-09, "kl": 0.04803466796875, "learning_rate": 9.627749118348541e-06, "loss": 0.0019, "num_tokens": 115876873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1683 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.5613333333333334, "grad_norm": 1.3518681729607351e-09, "kl": 0.04620361328125, "learning_rate": 9.616121909124801e-06, "loss": 0.0018, "num_tokens": 115953561.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1684 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5616666666666666, "grad_norm": 1.7428610776804021e-09, "kl": 0.045654296875, "learning_rate": 9.60449521961635e-06, "loss": 0.0018, "num_tokens": 116029593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1685 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.562, "grad_norm": 1.9048589372516744e-09, "kl": 0.0474853515625, "learning_rate": 9.592869065564043e-06, "loss": 0.0019, "num_tokens": 116104729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1686 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5623333333333334, "grad_norm": 2.7315361084134793e-09, "kl": 0.04876708984375, "learning_rate": 9.581243462708007e-06, "loss": 0.0019, "num_tokens": 116180073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1687 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5626666666666666, "grad_norm": 1.786708447859553e-09, "kl": 0.04278564453125, "learning_rate": 9.56961842678762e-06, "loss": 0.0017, "num_tokens": 116256553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1688 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.563, "grad_norm": 2.1482200462941137e-09, "kl": 0.04547119140625, "learning_rate": 9.557993973541494e-06, "loss": 0.0018, "num_tokens": 116332073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1689 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5633333333333334, "grad_norm": 1.6395352853137979e-09, "kl": 0.04632568359375, "learning_rate": 9.546370118707463e-06, "loss": 0.0019, "num_tokens": 116407705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1690 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5636666666666666, "grad_norm": 1.256568626928356e-09, "kl": 0.04833984375, "learning_rate": 9.534746878022533e-06, "loss": 0.0019, 
"num_tokens": 116482873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1691 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.564, "grad_norm": 1.0780000225807385e-09, "kl": 0.047119140625, "learning_rate": 9.523124267222894e-06, "loss": 0.0019, "num_tokens": 116558297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1692 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5643333333333334, "grad_norm": 1.355680456782693e-09, "kl": 0.04229736328125, "learning_rate": 9.511502302043867e-06, "loss": 0.0017, "num_tokens": 116633849.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 1693 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5646666666666667, "grad_norm": 2.220461370328053e-09, "kl": 0.046142578125, "learning_rate": 9.49988099821992e-06, "loss": 0.0018, "num_tokens": 116710953.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1694 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.565, "grad_norm": 1.2572445307057478e-09, "kl": 0.04290771484375, "learning_rate": 9.488260371484603e-06, "loss": 0.0017, "num_tokens": 116786665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1695 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.5653333333333334, "grad_norm": 1.6089199972313395e-09, "kl": 0.0404052734375, "learning_rate": 9.476640437570562e-06, "loss": 0.0016, "num_tokens": 116863657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1696 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5656666666666667, "grad_norm": 1.5192305191646938e-09, "kl": 0.04718017578125, "learning_rate": 9.465021212209508e-06, "loss": 0.0019, "num_tokens": 116941193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1697 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.566, "grad_norm": 1.9401311668332255e-09, "kl": 0.04638671875, "learning_rate": 9.453402711132188e-06, "loss": 0.0019, "num_tokens": 117017929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1698 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5663333333333334, "grad_norm": 1.5091872196393297e-09, "kl": 0.0450439453125, "learning_rate": 9.441784950068362e-06, "loss": 0.0018, "num_tokens": 117093241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1699 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5666666666666667, "grad_norm": 1.7722497913652546e-09, "kl": 0.04705810546875, "learning_rate": 9.430167944746802e-06, "loss": 0.0019, "num_tokens": 117168201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1700 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.567, "grad_norm": 1.4675030080013585e-09, "kl": 0.04595947265625, "learning_rate": 9.418551710895243e-06, "loss": 0.0018, "num_tokens": 117243913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1701 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5673333333333334, "grad_norm": 2.2495594276250586e-09, "kl": 0.0452880859375, "learning_rate": 9.406936264240386e-06, "loss": 0.0018, "num_tokens": 117323785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1702 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5676666666666667, "grad_norm": 1.7817670672215513e-09, "kl": 0.05072021484375, "learning_rate": 9.395321620507857e-06, "loss": 0.002, 
"num_tokens": 117398953.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1703 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.568, "grad_norm": 1.2746972366528553e-09, "kl": 0.0411376953125, "learning_rate": 9.383707795422207e-06, "loss": 0.0016, "num_tokens": 117473657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1704 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5683333333333334, "grad_norm": 2.6153930132721825e-09, "kl": 0.0469970703125, "learning_rate": 9.372094804706867e-06, "loss": 0.0019, "num_tokens": 117553193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 1705 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5686666666666667, "grad_norm": 1.7244937700056084e-09, "kl": 0.045654296875, "learning_rate": 9.360482664084144e-06, "loss": 0.0018, "num_tokens": 117626825.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1706 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.569, "grad_norm": 1.7249683903486357e-09, "kl": 0.04754638671875, "learning_rate": 9.348871389275194e-06, "loss": 0.0019, "num_tokens": 117701561.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1707 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.5693333333333334, "grad_norm": 1.752768485907552e-09, "kl": 0.043701171875, "learning_rate": 9.337260996000002e-06, "loss": 0.0017, "num_tokens": 117778969.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1708 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5696666666666667, "grad_norm": 1.8492066766739867e-09, "kl": 0.0479736328125, "learning_rate": 9.32565149997735e-06, "loss": 0.0019, "num_tokens": 117855001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1709 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.57, "grad_norm": 9.798014621864581e-10, "kl": 0.041748046875, "learning_rate": 9.314042916924816e-06, "loss": 0.0017, "num_tokens": 117928521.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 
0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1710 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5703333333333334, "grad_norm": 1.470273680581613e-09, "kl": 0.04510498046875, "learning_rate": 9.302435262558748e-06, "loss": 0.0018, "num_tokens": 118010297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1711 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5706666666666667, "grad_norm": 7.561631321095774e-10, "kl": 0.0401611328125, "learning_rate": 9.290828552594218e-06, "loss": 0.0016, "num_tokens": 118085657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1712 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, 
"completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.571, "grad_norm": 1.917776604187793e-09, "kl": 0.04620361328125, "learning_rate": 9.279222802745028e-06, "loss": 0.0018, "num_tokens": 118162361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1713 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5713333333333334, "grad_norm": 1.3652730057600593e-09, "kl": 0.0450439453125, "learning_rate": 9.267618028723687e-06, "loss": 0.0018, "num_tokens": 118236521.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1714 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5716666666666667, "grad_norm": 2.026567580415417e-09, "kl": 0.0458984375, "learning_rate": 9.256014246241369e-06, "loss": 0.0018, "num_tokens": 118313593.0, "reward": 0.0, "reward_std": 
0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1715 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.572, "grad_norm": 2.4780502094756685e-09, "kl": 0.04376220703125, "learning_rate": 9.244411471007923e-06, "loss": 0.0017, "num_tokens": 118390665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1716 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5723333333333334, "grad_norm": 2.0237731490624356e-09, "kl": 0.045166015625, "learning_rate": 9.232809718731815e-06, "loss": 0.0018, "num_tokens": 118467769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1717 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5726666666666667, "grad_norm": 3.4158820216845243e-09, "kl": 0.044677734375, "learning_rate": 9.221209005120142e-06, "loss": 0.0018, "num_tokens": 118544777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1718 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.573, "grad_norm": 1.8960197856188188e-09, "kl": 0.04254150390625, "learning_rate": 9.20960934587859e-06, "loss": 0.0017, "num_tokens": 118618729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1719 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5733333333333334, "grad_norm": 2.2596837734312203e-09, 
"kl": 0.0474853515625, "learning_rate": 9.198010756711413e-06, "loss": 0.0019, "num_tokens": 118694729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1720 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5736666666666667, "grad_norm": 1.983386788140251e-09, "kl": 0.04742431640625, "learning_rate": 9.18641325332142e-06, "loss": 0.0019, "num_tokens": 118770505.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1721 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.574, "grad_norm": 2.21807083811143e-09, "kl": 0.04571533203125, "learning_rate": 9.174816851409949e-06, "loss": 0.0018, "num_tokens": 118847865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1722 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5743333333333334, "grad_norm": 1.524444015466031e-09, "kl": 0.04681396484375, "learning_rate": 9.163221566676847e-06, "loss": 0.0019, "num_tokens": 118922073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1723 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5746666666666667, "grad_norm": 2.259560760720092e-09, "kl": 0.0443115234375, "learning_rate": 9.151627414820448e-06, "loss": 0.0018, "num_tokens": 118999673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1724 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 
0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.575, "grad_norm": 1.4119426738901097e-09, "kl": 0.04541015625, "learning_rate": 9.140034411537558e-06, "loss": 0.0018, "num_tokens": 119074793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1725 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5753333333333334, "grad_norm": 1.0345949652545983e-09, "kl": 0.04437255859375, "learning_rate": 9.128442572523418e-06, "loss": 0.0018, "num_tokens": 119149401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1726 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5756666666666667, "grad_norm": 2.1159862750863567e-09, "kl": 0.04351806640625, "learning_rate": 9.116851913471701e-06, "loss": 0.0017, "num_tokens": 119226537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1727 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.576, "grad_norm": 1.1043195247140147e-09, "kl": 0.04815673828125, "learning_rate": 9.105262450074479e-06, "loss": 0.0019, "num_tokens": 119300297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1728 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5763333333333334, "grad_norm": 3.860205044503573e-09, "kl": 0.04339599609375, "learning_rate": 9.093674198022201e-06, "loss": 0.0017, "num_tokens": 119378281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1729 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5766666666666667, "grad_norm": 1.656429327034914e-09, "kl": 0.04425048828125, "learning_rate": 9.082087173003686e-06, "loss": 0.0018, "num_tokens": 119457481.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1730 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.577, "grad_norm": 1.4837566730818708e-09, "kl": 0.04608154296875, "learning_rate": 9.07050139070608e-06, "loss": 0.0018, "num_tokens": 119533849.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1731 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5773333333333334, "grad_norm": 2.0708159631510625e-09, "kl": 0.04998779296875, "learning_rate": 
9.058916866814857e-06, "loss": 0.002, "num_tokens": 119611977.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1732 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5776666666666667, "grad_norm": 1.5584373791455164e-09, "kl": 0.0472412109375, "learning_rate": 9.047333617013786e-06, "loss": 0.0019, "num_tokens": 119687721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1733 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.578, "grad_norm": 1.3733874038024396e-09, "kl": 0.0438232421875, "learning_rate": 9.035751656984904e-06, "loss": 0.0018, "num_tokens": 119762457.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1734 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5783333333333334, "grad_norm": 1.005605265724796e-09, "kl": 0.0450439453125, "learning_rate": 9.024171002408507e-06, "loss": 0.0018, "num_tokens": 119836329.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1735 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5786666666666667, "grad_norm": 1.3734121617758888e-09, "kl": 0.044921875, "learning_rate": 9.012591668963123e-06, "loss": 0.0018, "num_tokens": 119911961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1736 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.579, "grad_norm": 1.5128538422004567e-09, "kl": 0.0447998046875, "learning_rate": 9.001013672325491e-06, "loss": 0.0018, "num_tokens": 119987497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1737 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5793333333333334, "grad_norm": 1.5829820787516269e-09, "kl": 0.04693603515625, "learning_rate": 8.989437028170537e-06, "loss": 0.0019, "num_tokens": 120061929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1738 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5796666666666667, "grad_norm": 1.4337884213233565e-09, "kl": 0.044677734375, "learning_rate": 8.977861752171365e-06, "loss": 0.0018, "num_tokens": 120136617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1739 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.58, "grad_norm": 1.5615622128706264e-09, "kl": 0.046630859375, "learning_rate": 8.966287859999216e-06, "loss": 0.0019, "num_tokens": 120212265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1740 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5803333333333334, "grad_norm": 2.503499185735336e-09, "kl": 0.046630859375, "learning_rate": 8.954715367323468e-06, "loss": 0.0019, "num_tokens": 120291945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1741 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5806666666666667, "grad_norm": 1.8234164178565493e-09, "kl": 0.04229736328125, "learning_rate": 8.94314428981159e-06, "loss": 0.0017, "num_tokens": 120367161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1742 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.581, "grad_norm": 1.9290238295610607e-09, "kl": 0.0447998046875, "learning_rate": 8.931574643129152e-06, "loss": 0.0018, "num_tokens": 120443449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1743 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5813333333333334, "grad_norm": 2.931271003703273e-09, "kl": 0.04437255859375, "learning_rate": 8.920006442939772e-06, "loss": 0.0018, 
"num_tokens": 120520713.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1744 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5816666666666667, "grad_norm": 2.500268436733677e-09, "kl": 0.047119140625, "learning_rate": 8.90843970490512e-06, "loss": 0.0019, "num_tokens": 120596841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1745 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.582, "grad_norm": 1.2232546087176388e-09, "kl": 0.04705810546875, "learning_rate": 8.896874444684882e-06, "loss": 0.0019, "num_tokens": 120671241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 1746 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5823333333333334, "grad_norm": 2.2328690008066587e-09, "kl": 0.04608154296875, "learning_rate": 8.885310677936746e-06, "loss": 0.0018, "num_tokens": 120747785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1747 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5826666666666667, "grad_norm": 1.1094360985453022e-09, "kl": 0.0472412109375, "learning_rate": 8.873748420316372e-06, "loss": 0.0019, "num_tokens": 120822409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1748 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 
0.0, "epoch": 0.583, "grad_norm": 1.3191042702587197e-09, "kl": 0.045654296875, "learning_rate": 8.862187687477386e-06, "loss": 0.0018, "num_tokens": 120896297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1749 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5833333333333334, "grad_norm": 2.8086502013024983e-09, "kl": 0.046875, "learning_rate": 8.850628495071336e-06, "loss": 0.0019, "num_tokens": 120972905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1750 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5836666666666667, "grad_norm": 1.377590375106763e-09, "kl": 0.0452880859375, "learning_rate": 8.839070858747697e-06, "loss": 0.0018, "num_tokens": 121047689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1751 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.584, "grad_norm": 2.4442166068894267e-09, "kl": 0.04412841796875, "learning_rate": 8.827514794153839e-06, "loss": 0.0018, "num_tokens": 121124201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1752 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5843333333333334, "grad_norm": 2.2935537913326698e-09, "kl": 0.0465087890625, "learning_rate": 8.815960316934991e-06, "loss": 0.0019, "num_tokens": 121200569.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1753 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 
0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5846666666666667, "grad_norm": 1.6534378310950615e-09, "kl": 0.0440673828125, "learning_rate": 8.804407442734244e-06, "loss": 0.0018, "num_tokens": 121275033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1754 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.585, "grad_norm": 1.8148305080956106e-09, "kl": 0.0491943359375, "learning_rate": 8.792856187192516e-06, "loss": 0.002, "num_tokens": 121351513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1755 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5853333333333334, "grad_norm": 1.8299756154860347e-09, "kl": 0.04730224609375, "learning_rate": 8.781306565948528e-06, "loss": 0.0019, "num_tokens": 121431913.0, "reward": 0.0, 
"reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1756 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5856666666666667, "grad_norm": 1.563575824370389e-09, "kl": 0.04473876953125, "learning_rate": 8.769758594638796e-06, "loss": 0.0018, "num_tokens": 121507049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1757 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.586, "grad_norm": 1.4938239534245668e-09, "kl": 0.0465087890625, "learning_rate": 8.758212288897597e-06, "loss": 0.0019, "num_tokens": 121581257.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 
1758 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5863333333333334, "grad_norm": 1.427981954904567e-09, "kl": 0.04595947265625, "learning_rate": 8.746667664356957e-06, "loss": 0.0018, "num_tokens": 121655081.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1759 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5866666666666667, "grad_norm": 1.3076499882913595e-09, "kl": 0.04791259765625, "learning_rate": 8.735124736646627e-06, "loss": 0.0019, "num_tokens": 121729449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1760 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.587, "grad_norm": 
2.683783417722907e-09, "kl": 0.04833984375, "learning_rate": 8.723583521394054e-06, "loss": 0.0019, "num_tokens": 121806617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1761 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5873333333333334, "grad_norm": 1.2446469410676286e-09, "kl": 0.04498291015625, "learning_rate": 8.712044034224374e-06, "loss": 0.0018, "num_tokens": 121885305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1762 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5876666666666667, "grad_norm": 2.503556251198802e-09, "kl": 0.043701171875, "learning_rate": 8.700506290760377e-06, "loss": 0.0017, "num_tokens": 121966905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1763 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.588, "grad_norm": 2.325371895040007e-09, "kl": 0.04705810546875, "learning_rate": 8.688970306622494e-06, "loss": 0.0019, "num_tokens": 122043385.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1764 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5883333333333334, "grad_norm": 1.0238431213949184e-09, "kl": 0.04150390625, "learning_rate": 8.677436097428775e-06, "loss": 0.0017, "num_tokens": 122118601.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1765 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5886666666666667, "grad_norm": 2.0231400998937943e-09, "kl": 0.04132080078125, "learning_rate": 8.665903678794873e-06, "loss": 0.0017, "num_tokens": 122194249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1766 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.589, "grad_norm": 1.476762934160547e-09, "kl": 0.0467529296875, "learning_rate": 8.654373066334007e-06, "loss": 0.0019, "num_tokens": 122269257.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1767 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5893333333333334, "grad_norm": 1.092842816241557e-09, "kl": 0.04498291015625, "learning_rate": 8.642844275656957e-06, "loss": 0.0018, "num_tokens": 122343177.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1768 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5896666666666667, "grad_norm": 1.4508390044909447e-09, "kl": 0.04534912109375, "learning_rate": 8.631317322372032e-06, "loss": 0.0018, "num_tokens": 122418233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1769 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.59, "grad_norm": 2.323455650099504e-09, "kl": 0.04827880859375, "learning_rate": 8.619792222085059e-06, "loss": 0.0019, "num_tokens": 122497529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1770 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5903333333333334, "grad_norm": 3.2626954471481895e-09, "kl": 0.04376220703125, "learning_rate": 8.60826899039935e-06, "loss": 0.0018, "num_tokens": 122575161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1771 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5906666666666667, "grad_norm": 1.5555665644484407e-09, "kl": 0.04730224609375, "learning_rate": 8.596747642915687e-06, "loss": 0.0019, "num_tokens": 122649737.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1772 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.591, "grad_norm": 3.3488369854950406e-09, 
"kl": 0.0482177734375, "learning_rate": 8.585228195232311e-06, "loss": 0.0019, "num_tokens": 122730745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1773 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5913333333333334, "grad_norm": 1.7979397970435684e-09, "kl": 0.04443359375, "learning_rate": 8.573710662944884e-06, "loss": 0.0018, "num_tokens": 122806905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1774 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5916666666666667, "grad_norm": 1.7654516737408699e-09, "kl": 0.04681396484375, "learning_rate": 8.562195061646474e-06, "loss": 0.0019, "num_tokens": 122881913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1775 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.592, "grad_norm": 1.456313403203069e-09, "kl": 0.04364013671875, "learning_rate": 8.550681406927534e-06, "loss": 0.0017, "num_tokens": 122960329.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1776 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5923333333333334, "grad_norm": 2.5032791395318554e-09, "kl": 0.04376220703125, "learning_rate": 8.539169714375885e-06, "loss": 0.0018, "num_tokens": 123038377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1777 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5926666666666667, "grad_norm": 1.246847958213948e-09, "kl": 0.04296875, "learning_rate": 8.527659999576692e-06, "loss": 0.0017, "num_tokens": 123111401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1778 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.593, "grad_norm": 1.3495038420074934e-09, "kl": 0.04522705078125, "learning_rate": 8.516152278112433e-06, "loss": 0.0018, "num_tokens": 123184489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1779 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5933333333333334, "grad_norm": 1.646224490059467e-09, "kl": 0.0428466796875, "learning_rate": 8.504646565562907e-06, "loss": 0.0017, "num_tokens": 123259769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 
0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1780 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5936666666666667, "grad_norm": 1.183572906349184e-09, "kl": 0.04351806640625, "learning_rate": 8.49314287750517e-06, "loss": 0.0017, "num_tokens": 123333769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1781 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.594, "grad_norm": 1.576472952180552e-09, "kl": 0.0496826171875, "learning_rate": 8.481641229513554e-06, "loss": 0.002, "num_tokens": 123409081.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1782 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5943333333333334, "grad_norm": 1.1496690266454834e-09, "kl": 0.045166015625, "learning_rate": 8.47014163715962e-06, "loss": 0.0018, "num_tokens": 123484185.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1783 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5946666666666667, "grad_norm": 1.5892974714049046e-09, "kl": 0.04510498046875, "learning_rate": 8.458644116012154e-06, "loss": 0.0018, "num_tokens": 123560585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1784 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.595, "grad_norm": 9.45226785731279e-10, "kl": 0.0469970703125, 
"learning_rate": 8.447148681637127e-06, "loss": 0.0019, "num_tokens": 123638345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1785 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5953333333333334, "grad_norm": 1.885247069566276e-09, "kl": 0.04534912109375, "learning_rate": 8.43565534959769e-06, "loss": 0.0018, "num_tokens": 123717977.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1786 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5956666666666667, "grad_norm": 3.654695213128889e-09, "kl": 0.04449462890625, "learning_rate": 8.424164135454158e-06, "loss": 0.0018, "num_tokens": 123796473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1787 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.596, "grad_norm": 1.504958047071625e-09, "kl": 0.04742431640625, "learning_rate": 8.412675054763963e-06, "loss": 0.0019, "num_tokens": 123872249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1788 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5963333333333334, "grad_norm": 1.5198285963080593e-09, "kl": 0.0443115234375, "learning_rate": 8.401188123081653e-06, "loss": 0.0018, "num_tokens": 123948313.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1789 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5966666666666667, "grad_norm": 1.829009499410006e-09, "kl": 0.04327392578125, "learning_rate": 8.389703355958873e-06, "loss": 0.0017, "num_tokens": 124025497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1790 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.597, "grad_norm": 1.350954570433771e-09, "kl": 0.04266357421875, "learning_rate": 8.378220768944328e-06, "loss": 0.0017, "num_tokens": 124101001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1791 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5973333333333334, "grad_norm": 1.2985260644526875e-09, "kl": 0.0472412109375, "learning_rate": 8.366740377583781e-06, "loss": 0.0019, "num_tokens": 124176025.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1792 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5976666666666667, "grad_norm": 1.8538298673931308e-09, "kl": 0.04705810546875, "learning_rate": 8.355262197420011e-06, "loss": 0.0019, "num_tokens": 124252649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1793 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.598, "grad_norm": 1.965329010644723e-09, "kl": 0.044921875, "learning_rate": 8.343786243992819e-06, "loss": 0.0018, "num_tokens": 124330009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1794 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5983333333333334, "grad_norm": 1.3647816210493602e-09, "kl": 0.047607421875, "learning_rate": 8.332312532838978e-06, "loss": 0.0019, "num_tokens": 124404713.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1795 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5986666666666667, "grad_norm": 1.900713586522329e-09, "kl": 0.04595947265625, "learning_rate": 8.32084107949223e-06, "loss": 0.0018, "num_tokens": 124481417.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1796 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.599, "grad_norm": 1.3123166997530689e-09, "kl": 0.04302978515625, "learning_rate": 8.309371899483261e-06, "loss": 0.0017, 
"num_tokens": 124559721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1797 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5993333333333334, "grad_norm": 1.8150818625883858e-09, "kl": 0.04449462890625, "learning_rate": 8.297905008339677e-06, "loss": 0.0018, "num_tokens": 124639881.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1798 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5996666666666667, "grad_norm": 2.8555537934238373e-09, "kl": 0.04437255859375, "learning_rate": 8.286440421585986e-06, "loss": 0.0018, "num_tokens": 124717641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1799 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6, "grad_norm": 1.5675584164043244e-09, "kl": 0.04486083984375, "learning_rate": 8.274978154743574e-06, "loss": 0.0018, "num_tokens": 124793545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1800 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6003333333333334, "grad_norm": 1.7557897358244645e-09, "kl": 0.0474853515625, "learning_rate": 8.263518223330698e-06, "loss": 0.0019, "num_tokens": 124866665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1801 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.6006666666666667, "grad_norm": 1.0279787021616471e-09, "kl": 0.04608154296875, "learning_rate": 8.252060642862436e-06, "loss": 0.0018, "num_tokens": 124941545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1802 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.601, "grad_norm": 1.3686799471557265e-09, "kl": 0.04156494140625, "learning_rate": 8.240605428850693e-06, "loss": 0.0017, "num_tokens": 125015417.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1803 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6013333333333334, "grad_norm": 2.2302211188929277e-09, "kl": 0.04486083984375, "learning_rate": 8.22915259680417e-06, "loss": 0.0018, "num_tokens": 125093321.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1804 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6016666666666667, "grad_norm": 1.1721226211847124e-09, "kl": 0.0477294921875, "learning_rate": 8.217702162228337e-06, "loss": 0.0019, "num_tokens": 125167913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1805 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.602, "grad_norm": 1.4239555090611589e-09, "kl": 0.044921875, "learning_rate": 8.206254140625425e-06, "loss": 0.0018, "num_tokens": 125241913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1806 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6023333333333334, "grad_norm": 1.7920175343633105e-09, "kl": 0.04705810546875, "learning_rate": 8.194808547494401e-06, "loss": 0.0019, "num_tokens": 125319049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1807 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6026666666666667, "grad_norm": 1.508231206592825e-09, "kl": 0.04571533203125, "learning_rate": 8.183365398330931e-06, "loss": 0.0018, "num_tokens": 125394249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1808 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.603, "grad_norm": 1.424103390768039e-09, "kl": 0.04217529296875, "learning_rate": 8.171924708627387e-06, "loss": 0.0017, 
"num_tokens": 125468841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1809 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6033333333333334, "grad_norm": 1.5848075074487156e-09, "kl": 0.0433349609375, "learning_rate": 8.1604864938728e-06, "loss": 0.0017, "num_tokens": 125545161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1810 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6036666666666667, "grad_norm": 1.7607113544926278e-09, "kl": 0.04730224609375, "learning_rate": 8.149050769552856e-06, "loss": 0.0019, "num_tokens": 125622601.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1811 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.604, "grad_norm": 1.8087880082617858e-09, "kl": 0.04443359375, "learning_rate": 8.137617551149868e-06, "loss": 0.0018, "num_tokens": 125697353.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1812 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6043333333333333, "grad_norm": 1.837575092089594e-09, "kl": 0.0455322265625, "learning_rate": 8.126186854142752e-06, "loss": 0.0018, "num_tokens": 125772217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1813 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.6046666666666667, "grad_norm": 1.5220485982680998e-09, "kl": 0.04681396484375, "learning_rate": 8.114758694007025e-06, "loss": 0.0019, "num_tokens": 125846761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1814 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.605, "grad_norm": 1.5049265167377257e-09, "kl": 0.04681396484375, "learning_rate": 8.103333086214753e-06, "loss": 0.0019, "num_tokens": 125921257.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1815 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6053333333333333, "grad_norm": 2.5190234342886697e-09, "kl": 0.040771484375, "learning_rate": 8.091910046234552e-06, "loss": 0.0016, "num_tokens": 125997433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1816 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6056666666666667, "grad_norm": 2.1337900335538507e-09, "kl": 0.04290771484375, "learning_rate": 8.080489589531567e-06, "loss": 0.0017, "num_tokens": 126072617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1817 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.606, "grad_norm": 1.868444732266994e-09, "kl": 0.0439453125, "learning_rate": 8.069071731567435e-06, "loss": 0.0018, "num_tokens": 126148425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1818 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6063333333333333, "grad_norm": 1.7837000765297262e-09, "kl": 0.04864501953125, "learning_rate": 8.057656487800283e-06, "loss": 0.0019, "num_tokens": 126224105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1819 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6066666666666667, "grad_norm": 3.08671643800551e-09, "kl": 0.04498291015625, "learning_rate": 8.046243873684694e-06, "loss": 0.0018, "num_tokens": 126303673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1820 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.607, "grad_norm": 1.7429819809677838e-09, "kl": 0.0474853515625, "learning_rate": 8.034833904671698e-06, "loss": 0.0019, 
"num_tokens": 126379961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1821 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6073333333333333, "grad_norm": 1.6735277608148635e-09, "kl": 0.04669189453125, "learning_rate": 8.023426596208739e-06, "loss": 0.0019, "num_tokens": 126455833.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1822 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6076666666666667, "grad_norm": 1.5251878648925299e-09, "kl": 0.040283203125, "learning_rate": 8.012021963739659e-06, "loss": 0.0016, "num_tokens": 126531689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1823 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.608, "grad_norm": 1.3055631020719716e-09, "kl": 0.044189453125, "learning_rate": 8.00062002270467e-06, "loss": 0.0018, "num_tokens": 126606649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1824 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6083333333333333, "grad_norm": 1.7147958608632052e-09, "kl": 0.0455322265625, "learning_rate": 7.989220788540356e-06, "loss": 0.0018, "num_tokens": 126682169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1825 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.6086666666666667, "grad_norm": 3.0053066701896114e-09, "kl": 0.04681396484375, "learning_rate": 7.977824276679623e-06, "loss": 0.0019, "num_tokens": 126759289.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1826 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.609, "grad_norm": 2.357055217672155e-09, "kl": 0.0479736328125, "learning_rate": 7.966430502551694e-06, "loss": 0.0019, "num_tokens": 126834169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1827 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6093333333333333, "grad_norm": 1.5345462678340027e-09, "kl": 0.04608154296875, "learning_rate": 7.955039481582098e-06, "loss": 0.0018, "num_tokens": 126907161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1828 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6096666666666667, "grad_norm": 2.952116107124425e-09, "kl": 0.04644775390625, "learning_rate": 7.943651229192615e-06, "loss": 0.0019, "num_tokens": 126981897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1829 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.61, "grad_norm": 1.2255934045413142e-09, "kl": 0.04754638671875, "learning_rate": 7.932265760801295e-06, "loss": 0.0019, "num_tokens": 127057625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1830 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6103333333333333, "grad_norm": 2.1801340732707786e-09, "kl": 0.0469970703125, "learning_rate": 7.92088309182241e-06, "loss": 0.0019, "num_tokens": 127132857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1831 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6106666666666667, "grad_norm": 1.4415745264173552e-09, "kl": 0.0418701171875, "learning_rate": 7.90950323766644e-06, "loss": 0.0017, "num_tokens": 127210681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1832 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.611, "grad_norm": 1.4148440197203627e-09, "kl": 0.0462646484375, "learning_rate": 
7.898126213740063e-06, "loss": 0.0018, "num_tokens": 127285913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1833 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6113333333333333, "grad_norm": 1.5988915746945054e-09, "kl": 0.0452880859375, "learning_rate": 7.886752035446116e-06, "loss": 0.0018, "num_tokens": 127361609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1834 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6116666666666667, "grad_norm": 1.6123699042580597e-09, "kl": 0.04473876953125, "learning_rate": 7.875380718183589e-06, "loss": 0.0018, "num_tokens": 127436057.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1835 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.612, "grad_norm": 1.6639797317807847e-09, "kl": 0.0445556640625, "learning_rate": 7.864012277347602e-06, "loss": 0.0018, "num_tokens": 127511193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1836 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6123333333333333, "grad_norm": 9.107178344791578e-10, "kl": 0.04400634765625, "learning_rate": 7.852646728329368e-06, "loss": 0.0018, "num_tokens": 127586009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1837 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6126666666666667, "grad_norm": 1.712260999653381e-09, "kl": 0.04571533203125, "learning_rate": 7.841284086516201e-06, "loss": 0.0018, "num_tokens": 127661241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1838 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.613, "grad_norm": 2.2920261244507856e-09, "kl": 0.04791259765625, "learning_rate": 7.829924367291467e-06, "loss": 0.0019, "num_tokens": 127739785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1839 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6133333333333333, "grad_norm": 2.4824513555898875e-09, "kl": 0.04705810546875, "learning_rate": 7.818567586034578e-06, "loss": 0.0019, "num_tokens": 127816409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1840 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6136666666666667, "grad_norm": 1.1884033757070256e-09, "kl": 0.046142578125, "learning_rate": 7.807213758120965e-06, "loss": 0.0018, "num_tokens": 127891449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1841 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.614, "grad_norm": 1.6770269617438771e-09, "kl": 0.04638671875, "learning_rate": 7.79586289892208e-06, "loss": 0.0019, "num_tokens": 127966569.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1842 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6143333333333333, "grad_norm": 2.0313823956286114e-09, "kl": 0.04803466796875, "learning_rate": 7.784515023805328e-06, "loss": 0.0019, "num_tokens": 128042633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1843 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6146666666666667, "grad_norm": 1.2903884627490925e-09, "kl": 0.04656982421875, "learning_rate": 7.773170148134092e-06, "loss": 0.0019, "num_tokens": 128121481.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1844 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.615, "grad_norm": 1.517370007420027e-09, "kl": 0.04510498046875, "learning_rate": 
7.761828287267688e-06, "loss": 0.0018, "num_tokens": 128197145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1845 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6153333333333333, "grad_norm": 9.97361304655442e-10, "kl": 0.03985595703125, "learning_rate": 7.750489456561351e-06, "loss": 0.0016, "num_tokens": 128272041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1846 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6156666666666667, "grad_norm": 1.296184604093753e-09, "kl": 0.04620361328125, "learning_rate": 7.739153671366219e-06, "loss": 0.0018, "num_tokens": 128347721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1847 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.616, "grad_norm": 1.082241185557109e-09, "kl": 0.04681396484375, "learning_rate": 7.727820947029289e-06, "loss": 0.0019, "num_tokens": 128421561.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1848 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6163333333333333, "grad_norm": 2.2342969696609316e-09, "kl": 0.0474853515625, "learning_rate": 7.716491298893443e-06, "loss": 0.0019, "num_tokens": 128497817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1849 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6166666666666667, "grad_norm": 3.3846692115702126e-09, "kl": 0.046630859375, "learning_rate": 7.705164742297376e-06, "loss": 0.0019, "num_tokens": 128573609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1850 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.617, "grad_norm": 2.08920103439425e-09, "kl": 0.04498291015625, "learning_rate": 7.6938412925756e-06, "loss": 0.0018, "num_tokens": 128649433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1851 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6173333333333333, "grad_norm": 1.6775133504509654e-09, "kl": 0.0477294921875, "learning_rate": 7.68252096505843e-06, "loss": 0.0019, "num_tokens": 128724537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1852 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6176666666666667, "grad_norm": 8.972145804087006e-10, "kl": 0.0430908203125, "learning_rate": 7.671203775071942e-06, "loss": 0.0017, "num_tokens": 128798953.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1853 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.618, "grad_norm": 1.4025647310234035e-09, "kl": 0.04742431640625, "learning_rate": 7.65988973793798e-06, "loss": 0.0019, "num_tokens": 128873017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1854 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6183333333333333, "grad_norm": 2.208160987393626e-09, "kl": 0.04425048828125, "learning_rate": 7.6485788689741e-06, "loss": 0.0018, "num_tokens": 128949385.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1855 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6186666666666667, "grad_norm": 1.7150909581431506e-09, "kl": 0.04547119140625, "learning_rate": 7.637271183493587e-06, "loss": 0.0018, "num_tokens": 129022089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1856 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.619, "grad_norm": 2.241152818882597e-09, "kl": 0.047119140625, "learning_rate": 7.625966696805406e-06, "loss": 0.0019, 
"num_tokens": 129098265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1857 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6193333333333333, "grad_norm": 1.4086324329198874e-09, "kl": 0.0438232421875, "learning_rate": 7.6146654242141935e-06, "loss": 0.0018, "num_tokens": 129172265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1858 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6196666666666667, "grad_norm": 1.644347658036338e-09, "kl": 0.0435791015625, "learning_rate": 7.6033673810202314e-06, "loss": 0.0017, "num_tokens": 129249273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1859 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.62, "grad_norm": 1.4028938011279024e-09, "kl": 0.04364013671875, "learning_rate": 7.592072582519437e-06, "loss": 0.0017, "num_tokens": 129323673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1860 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6203333333333333, "grad_norm": 1.7844444810677373e-09, "kl": 0.0482177734375, "learning_rate": 7.580781044003324e-06, "loss": 0.0019, "num_tokens": 129400185.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1861 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.6206666666666667, "grad_norm": 1.3033287782349134e-09, "kl": 0.04364013671875, "learning_rate": 7.569492780759002e-06, "loss": 0.0017, "num_tokens": 129474953.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1862 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.621, "grad_norm": 1.7599040003091204e-09, "kl": 0.04864501953125, "learning_rate": 7.558207808069149e-06, "loss": 0.0019, "num_tokens": 129556601.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1863 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6213333333333333, "grad_norm": 2.2806734278901786e-09, "kl": 0.04412841796875, "learning_rate": 7.546926141211975e-06, "loss": 0.0018, "num_tokens": 129639817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1864 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6216666666666667, "grad_norm": 1.3473536730757019e-09, "kl": 0.04290771484375, "learning_rate": 7.535647795461224e-06, "loss": 0.0017, "num_tokens": 129714633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1865 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.622, "grad_norm": 2.456862047139907e-09, "kl": 0.04571533203125, "learning_rate": 7.524372786086143e-06, "loss": 0.0018, "num_tokens": 129791273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1866 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6223333333333333, "grad_norm": 1.1364569285632342e-09, "kl": 0.04510498046875, "learning_rate": 7.513101128351454e-06, "loss": 0.0018, "num_tokens": 129865561.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1867 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6226666666666667, "grad_norm": 2.0032744352249665e-09, "kl": 0.04473876953125, "learning_rate": 7.501832837517351e-06, "loss": 0.0018, "num_tokens": 129941689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1868 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.623, "grad_norm": 1.5712428025338454e-09, "kl": 0.04443359375, "learning_rate": 
7.490567928839472e-06, "loss": 0.0018, "num_tokens": 130016105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1869 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6233333333333333, "grad_norm": 3.05246272702675e-09, "kl": 0.04888916015625, "learning_rate": 7.4793064175688635e-06, "loss": 0.002, "num_tokens": 130094201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1870 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6236666666666667, "grad_norm": 2.1661195059863303e-09, "kl": 0.04693603515625, "learning_rate": 7.468048318951983e-06, "loss": 0.0019, "num_tokens": 130168665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1871 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.624, "grad_norm": 4.371242923184582e-09, "kl": 0.0457763671875, "learning_rate": 7.4567936482306625e-06, "loss": 0.0018, "num_tokens": 130247961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1872 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6243333333333333, "grad_norm": 1.803898919128244e-09, "kl": 0.04083251953125, "learning_rate": 7.445542420642097e-06, "loss": 0.0016, "num_tokens": 130324073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1873 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6246666666666667, "grad_norm": 1.482397871122032e-09, "kl": 0.04571533203125, "learning_rate": 7.434294651418815e-06, "loss": 0.0018, "num_tokens": 130400249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1874 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.625, "grad_norm": 1.830876561470518e-09, "kl": 0.0418701171875, "learning_rate": 7.423050355788663e-06, "loss": 0.0017, "num_tokens": 130472985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1875 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6253333333333333, "grad_norm": 2.9684679159203142e-09, "kl": 0.04840087890625, "learning_rate": 7.411809548974792e-06, "loss": 0.0019, "num_tokens": 130549097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1876 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6256666666666667, "grad_norm": 1.456159304247251e-09, "kl": 0.04827880859375, "learning_rate": 7.400572246195628e-06, "loss": 0.0019, "num_tokens": 130622953.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1877 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.626, "grad_norm": 1.2934106008444246e-09, "kl": 0.04400634765625, "learning_rate": 7.389338462664841e-06, "loss": 0.0018, "num_tokens": 130699241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1878 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6263333333333333, "grad_norm": 2.85818146927852e-09, "kl": 0.04840087890625, "learning_rate": 7.378108213591355e-06, "loss": 0.0019, "num_tokens": 130776937.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1879 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6266666666666667, "grad_norm": 2.1645660819302748e-09, "kl": 0.0469970703125, "learning_rate": 7.366881514179292e-06, "loss": 0.0019, "num_tokens": 130854041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1880 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.627, "grad_norm": 1.1394856169744116e-09, "kl": 0.04351806640625, "learning_rate": 
7.355658379627981e-06, "loss": 0.0017, "num_tokens": 130930137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1881 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6273333333333333, "grad_norm": 1.5089685057034785e-09, "kl": 0.04669189453125, "learning_rate": 7.344438825131912e-06, "loss": 0.0019, "num_tokens": 131004217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1882 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6276666666666667, "grad_norm": 2.163821566369961e-09, "kl": 0.045166015625, "learning_rate": 7.333222865880745e-06, "loss": 0.0018, "num_tokens": 131079913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1883 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.628, "grad_norm": 1.5465370095668618e-09, "kl": 0.044677734375, "learning_rate": 7.322010517059256e-06, "loss": 0.0018, "num_tokens": 131159513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1884 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6283333333333333, "grad_norm": 1.2963966566914564e-09, "kl": 0.04302978515625, "learning_rate": 7.310801793847344e-06, "loss": 0.0017, "num_tokens": 131234809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1885 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6286666666666667, "grad_norm": 1.6543231229348976e-09, "kl": 0.04510498046875, "learning_rate": 7.299596711419994e-06, "loss": 0.0018, "num_tokens": 131310921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1886 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.629, "grad_norm": 1.2287947326328208e-09, "kl": 0.044677734375, "learning_rate": 7.288395284947263e-06, "loss": 0.0018, "num_tokens": 131384217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1887 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6293333333333333, "grad_norm": 1.821832795734224e-09, "kl": 0.04644775390625, "learning_rate": 7.277197529594257e-06, "loss": 0.0019, "num_tokens": 131461817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1888 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6296666666666667, "grad_norm": 2.2385659992352203e-09, "kl": 0.04937744140625, "learning_rate": 7.266003460521116e-06, "loss": 0.002, "num_tokens": 131537369.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1889 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.63, "grad_norm": 1.1771449370812093e-09, "kl": 0.05133056640625, "learning_rate": 7.254813092882989e-06, "loss": 0.0021, "num_tokens": 131611497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1890 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6303333333333333, "grad_norm": 2.183838221370138e-09, "kl": 0.0452880859375, "learning_rate": 7.243626441830009e-06, "loss": 0.0018, "num_tokens": 131687081.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1891 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6306666666666667, "grad_norm": 2.6615354364878385e-09, "kl": 0.0458984375, "learning_rate": 7.23244352250728e-06, "loss": 0.0018, "num_tokens": 131765225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1892 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.631, "grad_norm": 2.396752352140652e-09, "kl": 0.0482177734375, "learning_rate": 
7.221264350054855e-06, "loss": 0.0019, "num_tokens": 131843225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1893 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6313333333333333, "grad_norm": 1.9081400903786516e-09, "kl": 0.04754638671875, "learning_rate": 7.210088939607709e-06, "loss": 0.0019, "num_tokens": 131919609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1894 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6316666666666667, "grad_norm": 2.6077777715016737e-09, "kl": 0.04376220703125, "learning_rate": 7.1989173062957345e-06, "loss": 0.0017, "num_tokens": 131998281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1895 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.632, "grad_norm": 1.4723079422296337e-09, "kl": 0.0445556640625, "learning_rate": 7.187749465243694e-06, "loss": 0.0018, "num_tokens": 132073673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1896 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6323333333333333, "grad_norm": 1.6434945626642161e-09, "kl": 0.04718017578125, "learning_rate": 7.176585431571235e-06, "loss": 0.0019, "num_tokens": 132148937.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1897 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6326666666666667, "grad_norm": 1.477611477618268e-09, "kl": 0.0438232421875, "learning_rate": 7.165425220392839e-06, "loss": 0.0018, "num_tokens": 132223673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1898 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.633, "grad_norm": 1.4106188439555467e-09, "kl": 0.04620361328125, "learning_rate": 7.154268846817812e-06, "loss": 0.0018, "num_tokens": 132296713.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1899 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6333333333333333, "grad_norm": 1.6255169432710659e-09, "kl": 0.0469970703125, "learning_rate": 7.143116325950266e-06, "loss": 0.0019, "num_tokens": 132372889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1900 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6336666666666667, "grad_norm": 3.496915867984285e-09, "kl": 0.0445556640625, "learning_rate": 7.131967672889101e-06, "loss": 0.0018, "num_tokens": 132448089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1901 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.634, "grad_norm": 2.3770960755342685e-09, "kl": 0.04522705078125, "learning_rate": 7.120822902727972e-06, "loss": 0.0018, "num_tokens": 132526057.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1902 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6343333333333333, "grad_norm": 1.4412132598451421e-09, "kl": 0.04779052734375, "learning_rate": 7.109682030555283e-06, "loss": 0.0019, "num_tokens": 132600793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1903 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6346666666666667, "grad_norm": 1.6879673214731383e-09, "kl": 0.04522705078125, "learning_rate": 7.0985450714541685e-06, "loss": 0.0018, "num_tokens": 132674953.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1904 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.635, "grad_norm": 1.508832281338357e-09, "kl": 0.0445556640625, "learning_rate": 
7.087412040502446e-06, "loss": 0.0018, "num_tokens": 132751001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1905 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6353333333333333, "grad_norm": 1.192776766245629e-09, "kl": 0.04534912109375, "learning_rate": 7.076282952772634e-06, "loss": 0.0018, "num_tokens": 132825817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1906 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6356666666666667, "grad_norm": 1.5989748414213523e-09, "kl": 0.04620361328125, "learning_rate": 7.0651578233318986e-06, "loss": 0.0019, "num_tokens": 132901945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1907 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.636, "grad_norm": 1.14927933836384e-09, "kl": 0.049560546875, "learning_rate": 7.054036667242055e-06, "loss": 0.002, "num_tokens": 132977033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1908 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6363333333333333, "grad_norm": 1.4603932507739614e-09, "kl": 0.050048828125, "learning_rate": 7.042919499559538e-06, "loss": 0.002, "num_tokens": 133052217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1909 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6366666666666667, "grad_norm": 1.643143843210737e-09, "kl": 0.0472412109375, "learning_rate": 7.031806335335372e-06, "loss": 0.0019, "num_tokens": 133128409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1910 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.637, "grad_norm": 1.7449706124494924e-09, "kl": 0.04669189453125, "learning_rate": 7.02069718961518e-06, "loss": 0.0019, "num_tokens": 133203273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1911 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6373333333333333, "grad_norm": 1.849943642717733e-09, "kl": 0.0428466796875, "learning_rate": 7.009592077439135e-06, "loss": 0.0017, "num_tokens": 133280057.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1912 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6376666666666667, "grad_norm": 1.491166523592824e-09, "kl": 0.04681396484375, "learning_rate": 6.9984910138419434e-06, "loss": 0.0019, "num_tokens": 133354969.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1913 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.638, "grad_norm": 1.8732115858455245e-09, "kl": 0.04644775390625, "learning_rate": 6.987394013852843e-06, "loss": 0.0019, "num_tokens": 133431449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1914 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6383333333333333, "grad_norm": 1.2184496744893636e-09, "kl": 0.044189453125, "learning_rate": 6.976301092495556e-06, "loss": 0.0018, "num_tokens": 133506713.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1915 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6386666666666667, "grad_norm": 1.3719752001151164e-09, "kl": 0.04656982421875, "learning_rate": 6.9652122647882966e-06, "loss": 0.0019, "num_tokens": 133585721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1916 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.639, "grad_norm": 2.07307415678315e-09, "kl": 0.044677734375, "learning_rate": 
6.9541275457437215e-06, "loss": 0.0018, "num_tokens": 133662985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1917 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6393333333333333, "grad_norm": 1.8676311608345486e-09, "kl": 0.04876708984375, "learning_rate": 6.943046950368944e-06, "loss": 0.0019, "num_tokens": 133740041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1918 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6396666666666667, "grad_norm": 1.234923496795659e-09, "kl": 0.04364013671875, "learning_rate": 6.931970493665478e-06, "loss": 0.0017, "num_tokens": 133813305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1919 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.64, "grad_norm": 9.19981757441235e-10, "kl": 0.04339599609375, "learning_rate": 6.920898190629242e-06, "loss": 0.0017, "num_tokens": 133891305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1920 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6403333333333333, "grad_norm": 2.7641804400957426e-09, "kl": 0.046630859375, "learning_rate": 6.909830056250527e-06, "loss": 0.0019, "num_tokens": 133969673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1921 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6406666666666667, "grad_norm": 1.6468314489870295e-09, "kl": 0.04010009765625, "learning_rate": 6.8987661055139865e-06, "loss": 0.0016, "num_tokens": 134046041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1922 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.641, "grad_norm": 2.581733493656202e-09, "kl": 0.0498046875, "learning_rate": 6.8877063533986025e-06, "loss": 0.002, "num_tokens": 134125065.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1923 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6413333333333333, "grad_norm": 1.9296342301799996e-09, "kl": 0.045654296875, "learning_rate": 6.876650814877675e-06, "loss": 0.0018, "num_tokens": 134205257.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1924 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6416666666666667, "grad_norm": 2.031419032988424e-09, "kl": 0.046142578125, "learning_rate": 6.865599504918805e-06, "loss": 0.0018, "num_tokens": 134281545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1925 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.642, "grad_norm": 2.3758641720661444e-09, "kl": 0.0433349609375, "learning_rate": 6.854552438483866e-06, "loss": 0.0017, "num_tokens": 134363017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1926 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6423333333333333, "grad_norm": 1.1521709142314762e-09, "kl": 0.04876708984375, "learning_rate": 6.843509630528977e-06, "loss": 0.002, "num_tokens": 134438025.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1927 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6426666666666667, "grad_norm": 2.387333442044337e-09, "kl": 0.04522705078125, "learning_rate": 6.832471096004505e-06, "loss": 0.0018, "num_tokens": 134514441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1928 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.643, "grad_norm": 2.3476260935240134e-09, "kl": 0.04644775390625, "learning_rate": 6.821436849855023e-06, "loss": 0.0019, 
"num_tokens": 134589913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1929 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6433333333333333, "grad_norm": 1.4283827454164566e-09, "kl": 0.041748046875, "learning_rate": 6.8104069070193e-06, "loss": 0.0017, "num_tokens": 134664409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1930 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6436666666666667, "grad_norm": 1.7663965845571283e-09, "kl": 0.04443359375, "learning_rate": 6.799381282430284e-06, "loss": 0.0018, "num_tokens": 134739753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 
0.0, "rewards/tag_count_reward/std": 0.0, "step": 1931 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.644, "grad_norm": 1.4734766740076566e-09, "kl": 0.04278564453125, "learning_rate": 6.78835999101507e-06, "loss": 0.0017, "num_tokens": 134815129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1932 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6443333333333333, "grad_norm": 2.112878982885036e-09, "kl": 0.04937744140625, "learning_rate": 6.777343047694891e-06, "loss": 0.002, "num_tokens": 134892665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1933 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.6446666666666667, "grad_norm": 9.772184172973652e-10, "kl": 0.04364013671875, "learning_rate": 6.766330467385088e-06, "loss": 0.0017, "num_tokens": 134967289.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1934 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.645, "grad_norm": 1.7630332749263289e-09, "kl": 0.04925537109375, "learning_rate": 6.755322264995099e-06, "loss": 0.002, "num_tokens": 135041529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1935 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6453333333333333, "grad_norm": 1.7300717525259302e-09, "kl": 0.0479736328125, "learning_rate": 6.744318455428436e-06, "loss": 0.0019, "num_tokens": 135116697.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1936 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6456666666666667, "grad_norm": 1.5565276845208587e-09, "kl": 0.04833984375, "learning_rate": 6.733319053582659e-06, "loss": 0.0019, "num_tokens": 135192057.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1937 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.646, "grad_norm": 2.372598117972302e-09, "kl": 0.05157470703125, "learning_rate": 6.722324074349367e-06, "loss": 0.0021, "num_tokens": 135268441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1938 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 
0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6463333333333333, "grad_norm": 1.917065617362823e-09, "kl": 0.047607421875, "learning_rate": 6.711333532614168e-06, "loss": 0.0019, "num_tokens": 135343945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1939 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6466666666666666, "grad_norm": 1.2906954394154013e-09, "kl": 0.04449462890625, "learning_rate": 6.700347443256661e-06, "loss": 0.0018, "num_tokens": 135419961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1940 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.647, "grad_norm": 2.011222743902863e-09, "kl": 0.04376220703125, "learning_rate": 6.689365821150421e-06, "loss": 0.0018, "num_tokens": 135496089.0, "reward": 0.0, 
"reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1941 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6473333333333333, "grad_norm": 1.7532705287592876e-09, "kl": 0.0498046875, "learning_rate": 6.67838868116297e-06, "loss": 0.002, "num_tokens": 135569785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1942 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6476666666666666, "grad_norm": 1.4899961264802641e-09, "kl": 0.045166015625, "learning_rate": 6.667416038155763e-06, "loss": 0.0018, "num_tokens": 135643817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, 
"step": 1943 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.648, "grad_norm": 1.268116722741297e-09, "kl": 0.04132080078125, "learning_rate": 6.656447906984168e-06, "loss": 0.0017, "num_tokens": 135717033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1944 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6483333333333333, "grad_norm": 9.861612637607209e-10, "kl": 0.04486083984375, "learning_rate": 6.645484302497452e-06, "loss": 0.0018, "num_tokens": 135791497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1945 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6486666666666666, "grad_norm": 
1.6581522821468297e-09, "kl": 0.04864501953125, "learning_rate": 6.634525239538736e-06, "loss": 0.0019, "num_tokens": 135867577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1946 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.649, "grad_norm": 1.5769037187141066e-09, "kl": 0.04638671875, "learning_rate": 6.623570732945012e-06, "loss": 0.0019, "num_tokens": 135941721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1947 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6493333333333333, "grad_norm": 1.4770659140239673e-09, "kl": 0.04473876953125, "learning_rate": 6.612620797547087e-06, "loss": 0.0018, "num_tokens": 136016985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1948 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6496666666666666, "grad_norm": 1.8653953937075585e-09, "kl": 0.043701171875, "learning_rate": 6.601675448169591e-06, "loss": 0.0017, "num_tokens": 136092713.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1949 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.65, "grad_norm": 1.2497890500284825e-09, "kl": 0.0428466796875, "learning_rate": 6.590734699630939e-06, "loss": 0.0017, "num_tokens": 136166905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1950 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6503333333333333, "grad_norm": 1.3450764946298932e-09, "kl": 0.045654296875, "learning_rate": 6.579798566743314e-06, "loss": 0.0018, "num_tokens": 136241785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1951 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6506666666666666, "grad_norm": 1.7639859573037597e-09, "kl": 0.0438232421875, "learning_rate": 6.568867064312661e-06, "loss": 0.0018, "num_tokens": 136320137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1952 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.651, "grad_norm": 1.5360064331559897e-09, "kl": 0.0450439453125, "learning_rate": 6.5579402071386485e-06, "loss": 0.0018, "num_tokens": 136394697.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1953 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6513333333333333, "grad_norm": 2.8081055258866172e-09, "kl": 0.04486083984375, "learning_rate": 6.547018010014654e-06, "loss": 0.0018, "num_tokens": 136474649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1954 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6516666666666666, "grad_norm": 1.4487635535687104e-09, "kl": 0.045166015625, "learning_rate": 6.536100487727754e-06, "loss": 0.0018, "num_tokens": 136548441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1955 
}, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.652, "grad_norm": 1.3795813380568234e-09, "kl": 0.0482177734375, "learning_rate": 6.525187655058687e-06, "loss": 0.0019, "num_tokens": 136623465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1956 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6523333333333333, "grad_norm": 1.2294382178978935e-09, "kl": 0.0435791015625, "learning_rate": 6.5142795267818505e-06, "loss": 0.0017, "num_tokens": 136696457.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1957 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6526666666666666, "grad_norm": 
2.855998992856712e-09, "kl": 0.04486083984375, "learning_rate": 6.503376117665262e-06, "loss": 0.0018, "num_tokens": 136777929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1958 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.653, "grad_norm": 1.5704763045576442e-09, "kl": 0.044921875, "learning_rate": 6.492477442470566e-06, "loss": 0.0018, "num_tokens": 136854201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1959 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6533333333333333, "grad_norm": 1.36952138518609e-09, "kl": 0.04510498046875, "learning_rate": 6.481583515952983e-06, "loss": 0.0018, "num_tokens": 136927865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1960 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6536666666666666, "grad_norm": 1.4903973610813637e-09, "kl": 0.04425048828125, "learning_rate": 6.4706943528613135e-06, "loss": 0.0018, "num_tokens": 137002473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1961 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.654, "grad_norm": 1.578972952387403e-09, "kl": 0.04345703125, "learning_rate": 6.4598099679379024e-06, "loss": 0.0017, "num_tokens": 137076761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1962 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6543333333333333, "grad_norm": 2.420224687327277e-09, "kl": 0.043212890625, "learning_rate": 6.448930375918632e-06, "loss": 0.0017, "num_tokens": 137153625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1963 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6546666666666666, "grad_norm": 1.6870497221432856e-09, "kl": 0.0423583984375, "learning_rate": 6.43805559153289e-06, "loss": 0.0017, "num_tokens": 137233321.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1964 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.655, "grad_norm": 2.0265491507132083e-09, "kl": 0.04534912109375, "learning_rate": 6.427185629503561e-06, "loss": 0.0018, "num_tokens": 137310649.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1965 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6553333333333333, "grad_norm": 1.087271717103988e-09, "kl": 0.04351806640625, "learning_rate": 6.4163205045469975e-06, "loss": 0.0017, "num_tokens": 137384681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1966 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6556666666666666, "grad_norm": 1.3223340200596567e-09, "kl": 0.04638671875, "learning_rate": 6.405460231373003e-06, "loss": 0.0019, "num_tokens": 137459017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1967 
}, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.656, "grad_norm": 1.7804193674919588e-09, "kl": 0.04559326171875, "learning_rate": 6.394604824684815e-06, "loss": 0.0018, "num_tokens": 137534169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1968 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6563333333333333, "grad_norm": 1.5786283391605593e-09, "kl": 0.04437255859375, "learning_rate": 6.383754299179079e-06, "loss": 0.0018, "num_tokens": 137612953.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1969 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6566666666666666, "grad_norm": 
1.060493803883844e-09, "kl": 0.0469970703125, "learning_rate": 6.372908669545832e-06, "loss": 0.0019, "num_tokens": 137687145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1970 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.657, "grad_norm": 1.2245715552694492e-09, "kl": 0.0482177734375, "learning_rate": 6.362067950468489e-06, "loss": 0.0019, "num_tokens": 137761769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1971 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6573333333333333, "grad_norm": 1.6057961627069517e-09, "kl": 0.04473876953125, "learning_rate": 6.351232156623803e-06, "loss": 0.0018, "num_tokens": 137835689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1972 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6576666666666666, "grad_norm": 1.0489554780335197e-09, "kl": 0.04534912109375, "learning_rate": 6.340401302681879e-06, "loss": 0.0018, "num_tokens": 137911513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1973 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.658, "grad_norm": 1.6948649150805295e-09, "kl": 0.04632568359375, "learning_rate": 6.3295754033061196e-06, "loss": 0.0019, "num_tokens": 137987129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1974 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6583333333333333, "grad_norm": 2.8825064557480573e-09, "kl": 0.04730224609375, "learning_rate": 6.318754473153221e-06, "loss": 0.0019, "num_tokens": 138063849.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1975 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6586666666666666, "grad_norm": 1.1305566482988638e-09, "kl": 0.0413818359375, "learning_rate": 6.3079385268731575e-06, "loss": 0.0017, "num_tokens": 138142169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1976 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.659, "grad_norm": 1.2178956732000756e-09, "kl": 0.0472412109375, "learning_rate": 6.29712757910915e-06, "loss": 0.0019, "num_tokens": 138216697.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1977 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6593333333333333, "grad_norm": 2.704009460785528e-09, "kl": 0.04693603515625, "learning_rate": 6.286321644497655e-06, "loss": 0.0019, "num_tokens": 138293513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1978 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6596666666666666, "grad_norm": 1.8339284535429101e-09, "kl": 0.04638671875, "learning_rate": 6.275520737668338e-06, "loss": 0.0019, "num_tokens": 138370265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1979 }, 
{ "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.66, "grad_norm": 2.5290105565289878e-09, "kl": 0.0484619140625, "learning_rate": 6.26472487324407e-06, "loss": 0.0019, "num_tokens": 138446601.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1980 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6603333333333333, "grad_norm": 1.7671978325140003e-09, "kl": 0.04559326171875, "learning_rate": 6.25393406584088e-06, "loss": 0.0018, "num_tokens": 138523417.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1981 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6606666666666666, "grad_norm": 2.218245143126296e-09, 
"kl": 0.04693603515625, "learning_rate": 6.243148330067961e-06, "loss": 0.0019, "num_tokens": 138598041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1982 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.661, "grad_norm": 1.7759036463615985e-09, "kl": 0.05120849609375, "learning_rate": 6.2323676805276315e-06, "loss": 0.0021, "num_tokens": 138672905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1983 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6613333333333333, "grad_norm": 1.692885165383018e-09, "kl": 0.04510498046875, "learning_rate": 6.22159213181533e-06, "loss": 0.0018, "num_tokens": 138748617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1984 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6616666666666666, "grad_norm": 1.8566247428353222e-09, "kl": 0.0465087890625, "learning_rate": 6.210821698519592e-06, "loss": 0.0019, "num_tokens": 138823833.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1985 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.662, "grad_norm": 2.2181931846887437e-09, "kl": 0.04278564453125, "learning_rate": 6.200056395222012e-06, "loss": 0.0017, "num_tokens": 138901513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1986 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6623333333333333, "grad_norm": 2.2822197465188765e-09, "kl": 0.04473876953125, "learning_rate": 6.18929623649726e-06, "loss": 0.0018, "num_tokens": 138976569.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1987 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6626666666666666, "grad_norm": 1.4739386378082031e-09, "kl": 0.04766845703125, "learning_rate": 6.178541236913029e-06, "loss": 0.0019, "num_tokens": 139051465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1988 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.663, "grad_norm": 1.6366986654858806e-09, "kl": 0.04425048828125, "learning_rate": 6.167791411030027e-06, "loss": 0.0018, "num_tokens": 139127529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1989 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6633333333333333, "grad_norm": 1.3695743428243645e-09, "kl": 0.049072265625, "learning_rate": 6.157046773401964e-06, "loss": 0.002, "num_tokens": 139201753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1990 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6636666666666666, "grad_norm": 2.369603624430283e-09, "kl": 0.04461669921875, "learning_rate": 6.146307338575519e-06, "loss": 0.0018, "num_tokens": 139279497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1991 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.664, "grad_norm": 1.6871206653945592e-09, "kl": 0.04547119140625, "learning_rate": 6.135573121090327e-06, "loss": 0.0018, "num_tokens": 139356137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1992 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6643333333333333, "grad_norm": 1.5836225664145331e-09, "kl": 0.04949951171875, "learning_rate": 6.124844135478971e-06, "loss": 0.002, "num_tokens": 139433321.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1993 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6646666666666666, "grad_norm": 2.220256423157707e-09, "kl": 0.04766845703125, "learning_rate": 
6.114120396266936e-06, "loss": 0.0019, "num_tokens": 139509401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1994 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.665, "grad_norm": 1.900301693780193e-09, "kl": 0.04791259765625, "learning_rate": 6.1034019179726115e-06, "loss": 0.0019, "num_tokens": 139586217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1995 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6653333333333333, "grad_norm": 8.209671831238552e-10, "kl": 0.04547119140625, "learning_rate": 6.092688715107265e-06, "loss": 0.0018, "num_tokens": 139661241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1996 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6656666666666666, "grad_norm": 9.694834934848018e-10, "kl": 0.04638671875, "learning_rate": 6.081980802175016e-06, "loss": 0.0019, "num_tokens": 139735881.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1997 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.666, "grad_norm": 1.608321920087974e-09, "kl": 0.04681396484375, "learning_rate": 6.071278193672834e-06, "loss": 0.0019, "num_tokens": 139811689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1998 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.6663333333333333, "grad_norm": 1.3388211650422477e-09, "kl": 0.04205322265625, "learning_rate": 6.06058090409049e-06, "loss": 0.0017, "num_tokens": 139886649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1999 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6666666666666666, "grad_norm": 1.0311029807752448e-09, "kl": 0.0474853515625, "learning_rate": 6.049888947910569e-06, "loss": 0.0019, "num_tokens": 139961113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2000 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.667, "grad_norm": 1.3561779477200275e-09, "kl": 0.04296875, "learning_rate": 6.039202339608432e-06, "loss": 0.0017, "num_tokens": 140034249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2001 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6673333333333333, "grad_norm": 1.8810895063836597e-09, "kl": 0.04254150390625, "learning_rate": 6.028521093652195e-06, "loss": 0.0017, "num_tokens": 140108361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2002 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6676666666666666, "grad_norm": 1.844388641814021e-09, "kl": 0.045654296875, "learning_rate": 6.0178452245027165e-06, "loss": 0.0018, "num_tokens": 140184745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2003 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.668, "grad_norm": 2.4463555625686695e-09, "kl": 0.04541015625, "learning_rate": 6.007174746613576e-06, "loss": 0.0018, "num_tokens": 140263017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2004 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6683333333333333, "grad_norm": 1.571687002765998e-09, "kl": 0.04901123046875, "learning_rate": 5.996509674431053e-06, "loss": 0.002, "num_tokens": 140339609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2005 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6686666666666666, "grad_norm": 1.5943716347166514e-09, "kl": 0.04266357421875, "learning_rate": 
5.9858500223941066e-06, "loss": 0.0017, "num_tokens": 140415753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2006 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.669, "grad_norm": 1.707277763607351e-09, "kl": 0.04742431640625, "learning_rate": 5.975195804934369e-06, "loss": 0.0019, "num_tokens": 140497497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2007 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6693333333333333, "grad_norm": 1.3581950009111665e-09, "kl": 0.04461669921875, "learning_rate": 5.9645470364761e-06, "loss": 0.0018, "num_tokens": 140573241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2008 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6696666666666666, "grad_norm": 1.9264798645224346e-09, "kl": 0.0455322265625, "learning_rate": 5.953903731436191e-06, "loss": 0.0018, "num_tokens": 140650793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2009 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.67, "grad_norm": 2.4505941720320834e-09, "kl": 0.03790283203125, "learning_rate": 5.943265904224133e-06, "loss": 0.0015, "num_tokens": 140728105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2010 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.6703333333333333, "grad_norm": 1.2943666138909293e-09, "kl": 0.048583984375, "learning_rate": 5.932633569242e-06, "loss": 0.0019, "num_tokens": 140801273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2011 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6706666666666666, "grad_norm": 1.3821218614040731e-09, "kl": 0.0423583984375, "learning_rate": 5.922006740884436e-06, "loss": 0.0017, "num_tokens": 140875241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2012 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.671, "grad_norm": 2.01482985850987e-09, "kl": 0.0430908203125, "learning_rate": 5.911385433538621e-06, "loss": 0.0017, "num_tokens": 140951657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2013 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6713333333333333, "grad_norm": 1.3315020197524063e-09, "kl": 0.04400634765625, "learning_rate": 5.900769661584273e-06, "loss": 0.0018, "num_tokens": 141026185.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2014 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6716666666666666, "grad_norm": 1.3825495193131587e-09, "kl": 0.04791259765625, "learning_rate": 5.890159439393604e-06, "loss": 0.0019, "num_tokens": 141101081.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2015 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.672, "grad_norm": 1.579926633965556e-09, "kl": 0.046630859375, "learning_rate": 5.879554781331317e-06, "loss": 0.0019, "num_tokens": 141174681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2016 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6723333333333333, "grad_norm": 1.5863337310406678e-09, "kl": 0.04119873046875, "learning_rate": 5.868955701754584e-06, "loss": 0.0016, "num_tokens": 141249417.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2017 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6726666666666666, "grad_norm": 2.0115535903642012e-09, "kl": 0.04156494140625, "learning_rate": 
5.858362215013018e-06, "loss": 0.0017, "num_tokens": 141327305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2018 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.673, "grad_norm": 1.6781062095461152e-09, "kl": 0.044921875, "learning_rate": 5.847774335448671e-06, "loss": 0.0018, "num_tokens": 141403417.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2019 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6733333333333333, "grad_norm": 1.6332654118045298e-09, "kl": 0.04815673828125, "learning_rate": 5.83719207739599e-06, "loss": 0.0019, "num_tokens": 141485225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2020 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6736666666666666, "grad_norm": 2.074878269198166e-09, "kl": 0.04534912109375, "learning_rate": 5.8266154551818225e-06, "loss": 0.0018, "num_tokens": 141561497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2021 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.674, "grad_norm": 2.0747485951488898e-09, "kl": 0.042236328125, "learning_rate": 5.816044483125381e-06, "loss": 0.0017, "num_tokens": 141637273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2022 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.6743333333333333, "grad_norm": 1.8546603142155504e-09, "kl": 0.0482177734375, "learning_rate": 5.8054791755382286e-06, "loss": 0.0019, "num_tokens": 141712553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2023 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6746666666666666, "grad_norm": 1.969717944305671e-09, "kl": 0.0482177734375, "learning_rate": 5.7949195467242654e-06, "loss": 0.0019, "num_tokens": 141787737.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2024 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.675, "grad_norm": 1.5042752599114806e-09, "kl": 0.04595947265625, "learning_rate": 5.784365610979692e-06, "loss": 0.0018, "num_tokens": 141861449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2025 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6753333333333333, "grad_norm": 2.8594180356833476e-09, "kl": 0.04730224609375, "learning_rate": 5.773817382593008e-06, "loss": 0.0019, "num_tokens": 141939705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2026 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6756666666666666, "grad_norm": 1.5209469239607643e-09, "kl": 0.04730224609375, "learning_rate": 5.7632748758449865e-06, "loss": 0.0019, "num_tokens": 142013753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2027 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.676, "grad_norm": 1.316886599767031e-09, "kl": 0.04486083984375, "learning_rate": 5.7527381050086555e-06, "loss": 0.0018, "num_tokens": 142088249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2028 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6763333333333333, "grad_norm": 1.4880415788454115e-09, "kl": 0.04595947265625, "learning_rate": 5.742207084349274e-06, "loss": 0.0018, "num_tokens": 142161705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2029 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6766666666666666, "grad_norm": 2.9831463965734883e-09, "kl": 0.04656982421875, "learning_rate": 
5.73168182812432e-06, "loss": 0.0019, "num_tokens": 142240377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2030 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.677, "grad_norm": 1.5522306773263495e-09, "kl": 0.04071044921875, "learning_rate": 5.72116235058346e-06, "loss": 0.0016, "num_tokens": 142314281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2031 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6773333333333333, "grad_norm": 1.1397017773973062e-09, "kl": 0.04840087890625, "learning_rate": 5.710648665968543e-06, "loss": 0.0019, "num_tokens": 142389417.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2032 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6776666666666666, "grad_norm": 1.2189036446841328e-09, "kl": 0.0494384765625, "learning_rate": 5.700140788513575e-06, "loss": 0.002, "num_tokens": 142463945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2033 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.678, "grad_norm": 2.6483322201897863e-09, "kl": 0.04705810546875, "learning_rate": 5.689638732444699e-06, "loss": 0.0019, "num_tokens": 142542073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2034 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.6783333333333333, "grad_norm": 1.1243739272970288e-09, "kl": 0.044189453125, "learning_rate": 5.679142511980176e-06, "loss": 0.0018, "num_tokens": 142615977.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2035 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6786666666666666, "grad_norm": 1.3952109467751939e-09, "kl": 0.04559326171875, "learning_rate": 5.668652141330373e-06, "loss": 0.0018, "num_tokens": 142691065.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2036 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.679, "grad_norm": 3.4144658211943124e-09, "kl": 0.048583984375, "learning_rate": 5.65816763469772e-06, "loss": 0.0019, "num_tokens": 142768761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2037 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6793333333333333, "grad_norm": 1.879545852290221e-09, "kl": 0.04730224609375, "learning_rate": 5.647689006276727e-06, "loss": 0.0019, "num_tokens": 142848713.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2038 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6796666666666666, "grad_norm": 2.209214589043995e-09, "kl": 0.04541015625, "learning_rate": 5.637216270253934e-06, "loss": 0.0018, "num_tokens": 142925785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2039 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.68, "grad_norm": 1.7563561716116283e-09, "kl": 0.04833984375, "learning_rate": 5.626749440807915e-06, "loss": 0.0019, "num_tokens": 143001609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2040 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6803333333333333, "grad_norm": 1.4760456190643367e-09, "kl": 0.04571533203125, "learning_rate": 5.616288532109225e-06, "loss": 0.0018, "num_tokens": 143077801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2041 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6806666666666666, "grad_norm": 1.5510832618303994e-09, "kl": 0.0435791015625, "learning_rate": 
5.605833558320432e-06, "loss": 0.0017, "num_tokens": 143151849.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2042 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.681, "grad_norm": 1.6523277190927388e-09, "kl": 0.0428466796875, "learning_rate": 5.595384533596054e-06, "loss": 0.0017, "num_tokens": 143226073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2043 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6813333333333333, "grad_norm": 2.4642576867961452e-09, "kl": 0.0469970703125, "learning_rate": 5.584941472082549e-06, "loss": 0.0019, "num_tokens": 143304521.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2044 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6816666666666666, "grad_norm": 2.030916324002874e-09, "kl": 0.04351806640625, "learning_rate": 5.574504387918311e-06, "loss": 0.0017, "num_tokens": 143380617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2045 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.682, "grad_norm": 1.3033975010401377e-09, "kl": 0.04571533203125, "learning_rate": 5.564073295233645e-06, "loss": 0.0018, "num_tokens": 143454569.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2046 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.6823333333333333, "grad_norm": 2.7357243137515752e-09, "kl": 0.04534912109375, "learning_rate": 5.553648208150728e-06, "loss": 0.0018, "num_tokens": 143532873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2047 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6826666666666666, "grad_norm": 1.582678210709787e-09, "kl": 0.045166015625, "learning_rate": 5.543229140783619e-06, "loss": 0.0018, "num_tokens": 143607145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2048 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.683, "grad_norm": 1.2580937402972836e-09, "kl": 0.04180908203125, "learning_rate": 5.5328161072382355e-06, "loss": 0.0017, "num_tokens": 143682169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2049 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6833333333333333, "grad_norm": 1.1265001154114884e-09, "kl": 0.04913330078125, "learning_rate": 5.522409121612304e-06, "loss": 0.002, "num_tokens": 143756825.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2050 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6836666666666666, "grad_norm": 1.9295671727093122e-09, "kl": 0.04644775390625, "learning_rate": 5.512008197995379e-06, "loss": 0.0019, "num_tokens": 143831769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2051 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.684, "grad_norm": 2.35977948292998e-09, "kl": 0.04443359375, "learning_rate": 5.501613350468802e-06, "loss": 0.0018, "num_tokens": 143907161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2052 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6843333333333333, "grad_norm": 1.4888303923044077e-09, "kl": 0.0421142578125, "learning_rate": 5.491224593105695e-06, "loss": 0.0017, "num_tokens": 143986169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2053 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6846666666666666, "grad_norm": 1.4937405756754174e-09, "kl": 0.04412841796875, "learning_rate": 
5.480841939970918e-06, "loss": 0.0018, "num_tokens": 144062345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2054 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.685, "grad_norm": 1.4774684808926963e-09, "kl": 0.0465087890625, "learning_rate": 5.470465405121093e-06, "loss": 0.0019, "num_tokens": 144139593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2055 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6853333333333333, "grad_norm": 2.318879754881209e-09, "kl": 0.045166015625, "learning_rate": 5.460095002604533e-06, "loss": 0.0018, "num_tokens": 144214697.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2056 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6856666666666666, "grad_norm": 2.032983337230121e-09, "kl": 0.04827880859375, "learning_rate": 5.449730746461264e-06, "loss": 0.0019, "num_tokens": 144290169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2057 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.686, "grad_norm": 2.985671487820696e-09, "kl": 0.04425048828125, "learning_rate": 5.439372650722985e-06, "loss": 0.0018, "num_tokens": 144365353.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2058 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.6863333333333334, "grad_norm": 1.6914258882394506e-09, "kl": 0.0467529296875, "learning_rate": 5.429020729413062e-06, "loss": 0.0019, "num_tokens": 144441353.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2059 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6866666666666666, "grad_norm": 1.898779133924222e-09, "kl": 0.04412841796875, "learning_rate": 5.418674996546486e-06, "loss": 0.0018, "num_tokens": 144517785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2060 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.687, "grad_norm": 1.4402256054424356e-09, "kl": 0.045654296875, "learning_rate": 5.4083354661298816e-06, "loss": 0.0018, "num_tokens": 144592841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2061 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6873333333333334, "grad_norm": 1.7919947747913056e-09, "kl": 0.04840087890625, "learning_rate": 5.398002152161484e-06, "loss": 0.0019, "num_tokens": 144668841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2062 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6876666666666666, "grad_norm": 2.1313002473988263e-09, "kl": 0.0439453125, "learning_rate": 5.387675068631093e-06, "loss": 0.0018, "num_tokens": 144745113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2063 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.688, "grad_norm": 1.473969835075195e-09, "kl": 0.0474853515625, "learning_rate": 5.377354229520086e-06, "loss": 0.0019, "num_tokens": 144819417.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2064 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6883333333333334, "grad_norm": 1.8933425938172377e-09, "kl": 0.044921875, "learning_rate": 5.367039648801386e-06, "loss": 0.0018, "num_tokens": 144896825.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2065 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6886666666666666, "grad_norm": 1.482761025073387e-09, "kl": 0.0489501953125, "learning_rate": 
5.356731340439432e-06, "loss": 0.002, "num_tokens": 144972729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2066 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.689, "grad_norm": 1.2817790162600318e-09, "kl": 0.04937744140625, "learning_rate": 5.346429318390185e-06, "loss": 0.002, "num_tokens": 145045913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2067 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6893333333333334, "grad_norm": 1.8539841883935537e-09, "kl": 0.05120849609375, "learning_rate": 5.336133596601089e-06, "loss": 0.002, "num_tokens": 145120665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2068 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6896666666666667, "grad_norm": 1.4539929260592999e-09, "kl": 0.046630859375, "learning_rate": 5.325844189011058e-06, "loss": 0.0019, "num_tokens": 145195001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2069 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.69, "grad_norm": 3.0194735600730382e-09, "kl": 0.044921875, "learning_rate": 5.31556110955046e-06, "loss": 0.0018, "num_tokens": 145272713.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2070 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.6903333333333334, "grad_norm": 1.479883660060466e-09, "kl": 0.0426025390625, "learning_rate": 5.305284372141095e-06, "loss": 0.0017, "num_tokens": 145348745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2071 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6906666666666667, "grad_norm": 1.2946799188284785e-09, "kl": 0.04815673828125, "learning_rate": 5.2950139906961716e-06, "loss": 0.0019, "num_tokens": 145422921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2072 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.691, "grad_norm": 1.7997818790860265e-09, "kl": 0.04925537109375, "learning_rate": 5.284749979120299e-06, "loss": 0.002, "num_tokens": 145498569.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2073 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6913333333333334, "grad_norm": 1.3114987984508275e-09, "kl": 0.0455322265625, "learning_rate": 5.274492351309462e-06, "loss": 0.0018, "num_tokens": 145573865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2074 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6916666666666667, "grad_norm": 1.2795651205266267e-09, "kl": 0.04296875, "learning_rate": 5.2642411211510005e-06, "loss": 0.0017, "num_tokens": 145653497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2075 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.692, "grad_norm": 1.3581488156333421e-09, "kl": 0.0458984375, "learning_rate": 5.253996302523596e-06, "loss": 0.0018, "num_tokens": 145727097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2076 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6923333333333334, "grad_norm": 1.5171742751007855e-09, "kl": 0.04449462890625, "learning_rate": 5.243757909297247e-06, "loss": 0.0018, "num_tokens": 145801849.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2077 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6926666666666667, "grad_norm": 1.035644126012869e-09, "kl": 0.045654296875, "learning_rate": 
5.233525955333258e-06, "loss": 0.0018, "num_tokens": 145876761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2078 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.693, "grad_norm": 1.6071520780869264e-09, "kl": 0.04132080078125, "learning_rate": 5.223300454484204e-06, "loss": 0.0017, "num_tokens": 145953289.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2079 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6933333333333334, "grad_norm": 1.9325581135376524e-09, "kl": 0.04681396484375, "learning_rate": 5.213081420593933e-06, "loss": 0.0019, "num_tokens": 146028025.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2080 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6936666666666667, "grad_norm": 1.6818265669016341e-09, "kl": 0.05078125, "learning_rate": 5.202868867497542e-06, "loss": 0.002, "num_tokens": 146103769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2081 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.694, "grad_norm": 1.5958325771947557e-09, "kl": 0.04736328125, "learning_rate": 5.192662809021334e-06, "loss": 0.0019, "num_tokens": 146179113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2082 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.6943333333333334, "grad_norm": 1.9440562493144853e-09, "kl": 0.0452880859375, "learning_rate": 5.1824632589828465e-06, "loss": 0.0018, "num_tokens": 146263369.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2083 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6946666666666667, "grad_norm": 2.4925215225124475e-09, "kl": 0.044677734375, "learning_rate": 5.172270231190789e-06, "loss": 0.0018, "num_tokens": 146339257.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2084 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.695, "grad_norm": 1.077152811390647e-09, "kl": 0.04925537109375, "learning_rate": 5.162083739445038e-06, "loss": 0.002, "num_tokens": 146414569.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2085 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6953333333333334, "grad_norm": 1.532193483200217e-09, "kl": 0.041046142578125, "learning_rate": 5.151903797536631e-06, "loss": 0.0016, "num_tokens": 146491769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2086 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6956666666666667, "grad_norm": 1.783924008513793e-09, "kl": 0.047607421875, "learning_rate": 5.141730419247735e-06, "loss": 0.0019, "num_tokens": 146566969.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2087 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.696, "grad_norm": 1.5400581920843592e-09, "kl": 0.0465087890625, "learning_rate": 5.131563618351624e-06, "loss": 0.0019, "num_tokens": 146641657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2088 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6963333333333334, "grad_norm": 1.463492549369505e-09, "kl": 0.04302978515625, "learning_rate": 5.121403408612672e-06, "loss": 0.0017, "num_tokens": 146717241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2089 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6966666666666667, "grad_norm": 1.8039523208557284e-09, "kl": 0.04620361328125, "learning_rate": 
5.111249803786342e-06, "loss": 0.0019, "num_tokens": 146794537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2090 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.697, "grad_norm": 2.022987111161001e-09, "kl": 0.046142578125, "learning_rate": 5.101102817619132e-06, "loss": 0.0018, "num_tokens": 146871529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2091 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6973333333333334, "grad_norm": 1.6709652550517262e-09, "kl": 0.0401611328125, "learning_rate": 5.090962463848592e-06, "loss": 0.0016, "num_tokens": 146948457.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2092 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6976666666666667, "grad_norm": 1.714156594445626e-09, "kl": 0.0465087890625, "learning_rate": 5.080828756203294e-06, "loss": 0.0019, "num_tokens": 147023577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2093 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.698, "grad_norm": 9.676069945285803e-10, "kl": 0.04559326171875, "learning_rate": 5.070701708402812e-06, "loss": 0.0018, "num_tokens": 147097897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2094 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.6983333333333334, "grad_norm": 1.453238973603277e-09, "kl": 0.04852294921875, "learning_rate": 5.060581334157693e-06, "loss": 0.0019, "num_tokens": 147173257.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2095 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6986666666666667, "grad_norm": 2.0685952950572073e-09, "kl": 0.04791259765625, "learning_rate": 5.05046764716946e-06, "loss": 0.0019, "num_tokens": 147249721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2096 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.699, "grad_norm": 1.3517864605461227e-09, "kl": 0.04638671875, "learning_rate": 5.04036066113058e-06, "loss": 0.0019, "num_tokens": 147323913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2097 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6993333333333334, "grad_norm": 2.0163619662838528e-09, "kl": 0.0394287109375, "learning_rate": 5.030260389724447e-06, "loss": 0.0016, "num_tokens": 147398553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2098 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6996666666666667, "grad_norm": 1.2988320419182742e-09, "kl": 0.04437255859375, "learning_rate": 5.020166846625365e-06, "loss": 0.0018, "num_tokens": 147473385.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2099 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7, "grad_norm": 1.288446682679023e-09, "kl": 0.04736328125, "learning_rate": 5.01008004549853e-06, "loss": 0.0019, "num_tokens": 147548361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2100 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7003333333333334, "grad_norm": 1.965429596850754e-09, "kl": 0.04327392578125, "learning_rate": 5.000000000000003e-06, "loss": 0.0017, "num_tokens": 147627929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7006666666666667, "grad_norm": 1.3585821356798533e-09, "kl": 0.04840087890625, "learning_rate": 
4.989926723776707e-06, "loss": 0.0019, "num_tokens": 147704937.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2102 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.701, "grad_norm": 3.060430797674485e-09, "kl": 0.0439453125, "learning_rate": 4.979860230466398e-06, "loss": 0.0018, "num_tokens": 147783881.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7013333333333334, "grad_norm": 2.4823232358528458e-09, "kl": 0.04034423828125, "learning_rate": 4.96980053369765e-06, "loss": 0.0016, "num_tokens": 147867241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7016666666666667, "grad_norm": 1.5334354896978653e-09, "kl": 0.04876708984375, "learning_rate": 4.959747647089833e-06, "loss": 0.0019, "num_tokens": 147942585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2105 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.702, "grad_norm": 1.142280492416603e-09, "kl": 0.04534912109375, "learning_rate": 4.949701584253103e-06, "loss": 0.0018, "num_tokens": 148021193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2106 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.7023333333333334, "grad_norm": 1.3960120837097634e-09, "kl": 0.0440673828125, "learning_rate": 4.939662358788364e-06, "loss": 0.0018, "num_tokens": 148095577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2107 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7026666666666667, "grad_norm": 1.0086537161058118e-09, "kl": 0.043212890625, "learning_rate": 4.929629984287278e-06, "loss": 0.0017, "num_tokens": 148169769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2108 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.703, "grad_norm": 1.8072595642237843e-09, "kl": 0.04205322265625, "learning_rate": 4.919604474332223e-06, "loss": 0.0017, "num_tokens": 148246585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2109 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7033333333333334, "grad_norm": 1.6154312332261611e-09, "kl": 0.04547119140625, "learning_rate": 4.909585842496287e-06, "loss": 0.0018, "num_tokens": 148322633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2110 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7036666666666667, "grad_norm": 1.4079877264094875e-09, "kl": 0.0430908203125, "learning_rate": 4.899574102343247e-06, "loss": 0.0017, "num_tokens": 148397705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.704, "grad_norm": 9.878139417551779e-10, "kl": 0.04718017578125, "learning_rate": 4.889569267427548e-06, "loss": 0.0019, "num_tokens": 148472697.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2112 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7043333333333334, "grad_norm": 1.757420431403034e-09, "kl": 0.04595947265625, "learning_rate": 4.879571351294287e-06, "loss": 0.0018, "num_tokens": 148546697.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7046666666666667, "grad_norm": 1.940094529473413e-09, "kl": 0.04449462890625, "learning_rate": 
4.869580367479187e-06, "loss": 0.0018, "num_tokens": 148629001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2114 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.705, "grad_norm": 1.536891280906616e-09, "kl": 0.04962158203125, "learning_rate": 4.859596329508598e-06, "loss": 0.002, "num_tokens": 148705049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7053333333333334, "grad_norm": 1.389648507377217e-09, "kl": 0.04736328125, "learning_rate": 4.849619250899458e-06, "loss": 0.0019, "num_tokens": 148782457.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2116 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7056666666666667, "grad_norm": 1.1227275775738121e-09, "kl": 0.0474853515625, "learning_rate": 4.8396491451592855e-06, "loss": 0.0019, "num_tokens": 148856649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.706, "grad_norm": 2.2867983062724306e-09, "kl": 0.04827880859375, "learning_rate": 4.8296860257861585e-06, "loss": 0.0019, "num_tokens": 148933225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.7063333333333334, "grad_norm": 2.3825512673880667e-09, "kl": 0.044189453125, "learning_rate": 4.8197299062687e-06, "loss": 0.0018, "num_tokens": 149014889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7066666666666667, "grad_norm": 1.2605755328465307e-09, "kl": 0.046875, "learning_rate": 4.809780800086046e-06, "loss": 0.0019, "num_tokens": 149090249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2120 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.707, "grad_norm": 1.584815279009888e-09, "kl": 0.05072021484375, "learning_rate": 4.799838720707847e-06, "loss": 0.002, "num_tokens": 149166281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7073333333333334, "grad_norm": 1.2099057311587558e-09, "kl": 0.0447998046875, "learning_rate": 4.78990368159424e-06, "loss": 0.0018, "num_tokens": 149241017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7076666666666667, "grad_norm": 1.5133145847556762e-09, "kl": 0.044921875, "learning_rate": 4.7799756961958195e-06, "loss": 0.0018, "num_tokens": 149317801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2123 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.708, "grad_norm": 1.2313777775219137e-09, "kl": 0.04345703125, "learning_rate": 4.770054777953647e-06, "loss": 0.0017, "num_tokens": 149392745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2124 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7083333333333334, "grad_norm": 1.6607938357893204e-09, "kl": 0.04449462890625, "learning_rate": 4.76014094029921e-06, "loss": 0.0018, "num_tokens": 149467033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2125 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7086666666666667, "grad_norm": 1.5748307102825265e-09, "kl": 0.0469970703125, "learning_rate": 4.7502341966544e-06, "loss": 0.0019, "num_tokens": 
149542585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2126 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.709, "grad_norm": 1.1141104705458815e-09, "kl": 0.04754638671875, "learning_rate": 4.7403345604315135e-06, "loss": 0.0019, "num_tokens": 149617705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2127 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7093333333333334, "grad_norm": 1.163764307143822e-09, "kl": 0.04827880859375, "learning_rate": 4.7304420450332244e-06, "loss": 0.0019, "num_tokens": 149690121.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 2128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7096666666666667, "grad_norm": 1.996229403999905e-09, "kl": 0.04461669921875, "learning_rate": 4.720556663852569e-06, "loss": 0.0018, "num_tokens": 149764761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.71, "grad_norm": 1.041037589466498e-09, "kl": 0.04473876953125, "learning_rate": 4.710678430272907e-06, "loss": 0.0018, "num_tokens": 149838857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2130 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.7103333333333334, "grad_norm": 1.8411533408979608e-09, "kl": 0.04400634765625, "learning_rate": 4.700807357667953e-06, "loss": 0.0018, "num_tokens": 149917577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2131 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7106666666666667, "grad_norm": 1.5097793015783623e-09, "kl": 0.0452880859375, "learning_rate": 4.690943459401693e-06, "loss": 0.0018, "num_tokens": 149993753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2132 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.711, "grad_norm": 1.459474985310294e-09, "kl": 0.04681396484375, "learning_rate": 4.681086748828424e-06, "loss": 0.0019, "num_tokens": 150069993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7113333333333334, "grad_norm": 2.696329159945776e-09, "kl": 0.0457763671875, "learning_rate": 4.671237239292699e-06, "loss": 0.0018, "num_tokens": 150147881.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2134 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7116666666666667, "grad_norm": 2.3898489853735327e-09, "kl": 0.04254150390625, "learning_rate": 4.661394944129334e-06, "loss": 0.0017, "num_tokens": 150223785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.712, "grad_norm": 2.07627381954012e-09, "kl": 0.04620361328125, "learning_rate": 4.65155987666336e-06, "loss": 0.0018, "num_tokens": 150299689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7123333333333334, "grad_norm": 1.6406819236536307e-09, "kl": 0.04632568359375, "learning_rate": 4.641732050210032e-06, "loss": 0.0019, "num_tokens": 150375081.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2137 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7126666666666667, "grad_norm": 2.9409696900017934e-09, "kl": 0.04730224609375, "learning_rate": 4.631911478074815e-06, "loss": 0.0019, 
"num_tokens": 150450585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.713, "grad_norm": 1.2244349978374203e-09, "kl": 0.04931640625, "learning_rate": 4.622098173553329e-06, "loss": 0.002, "num_tokens": 150526073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2139 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7133333333333334, "grad_norm": 1.415452866027067e-09, "kl": 0.0465087890625, "learning_rate": 4.612292149931369e-06, "loss": 0.0019, "num_tokens": 150600985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 2140 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7136666666666667, "grad_norm": 1.3033741863566206e-09, "kl": 0.0447998046875, "learning_rate": 4.6024934204848745e-06, "loss": 0.0018, "num_tokens": 150676121.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.714, "grad_norm": 2.133075716059807e-09, "kl": 0.0462646484375, "learning_rate": 4.592701998479896e-06, "loss": 0.0018, "num_tokens": 150752281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2142 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.7143333333333334, "grad_norm": 3.084403843445216e-09, "kl": 0.04852294921875, "learning_rate": 4.582917897172603e-06, "loss": 0.0019, "num_tokens": 150833913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2143 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7146666666666667, "grad_norm": 2.0388870591858677e-09, "kl": 0.04669189453125, "learning_rate": 4.573141129809252e-06, "loss": 0.0019, "num_tokens": 150908921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2144 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.715, "grad_norm": 1.781347402918243e-09, "kl": 0.04766845703125, "learning_rate": 4.563371709626167e-06, "loss": 0.0019, "num_tokens": 150985177.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7153333333333334, "grad_norm": 1.2571244045744834e-09, "kl": 0.04083251953125, "learning_rate": 4.5536096498497295e-06, "loss": 0.0016, "num_tokens": 151060057.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7156666666666667, "grad_norm": 1.179564002029565e-09, "kl": 0.0494384765625, "learning_rate": 4.5438549636963534e-06, "loss": 0.002, "num_tokens": 151136313.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.716, "grad_norm": 1.0286048679475357e-09, "kl": 0.04669189453125, "learning_rate": 4.534107664372466e-06, "loss": 0.0019, "num_tokens": 151210281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2148 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7163333333333334, "grad_norm": 1.909217450801748e-09, "kl": 0.04693603515625, "learning_rate": 4.524367765074499e-06, "loss": 0.0019, "num_tokens": 151286841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2149 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7166666666666667, "grad_norm": 2.408347743454442e-09, "kl": 0.0455322265625, "learning_rate": 4.514635278988866e-06, "loss": 0.0018, 
"num_tokens": 151367913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2150 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.717, "grad_norm": 2.5759745447828664e-09, "kl": 0.0472412109375, "learning_rate": 4.504910219291941e-06, "loss": 0.0019, "num_tokens": 151445033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7173333333333334, "grad_norm": 2.2820960676739332e-09, "kl": 0.04449462890625, "learning_rate": 4.495192599150045e-06, "loss": 0.0018, "num_tokens": 151527017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 2152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7176666666666667, "grad_norm": 1.5311073520152263e-09, "kl": 0.04119873046875, "learning_rate": 4.4854824317194266e-06, "loss": 0.0016, "num_tokens": 151602889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.718, "grad_norm": 1.3136572940553037e-09, "kl": 0.04449462890625, "learning_rate": 4.475779730146252e-06, "loss": 0.0018, "num_tokens": 151677881.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.7183333333333334, "grad_norm": 2.50921838862439e-09, "kl": 0.04840087890625, "learning_rate": 4.46608450756656e-06, "loss": 0.0019, "num_tokens": 151753161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7186666666666667, "grad_norm": 1.3053715886002237e-09, "kl": 0.04107666015625, "learning_rate": 4.4563967771062856e-06, "loss": 0.0016, "num_tokens": 151828425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.719, "grad_norm": 1.403503424590724e-09, "kl": 0.04791259765625, "learning_rate": 4.446716551881213e-06, "loss": 0.0019, "num_tokens": 151902697.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7193333333333334, "grad_norm": 1.8047752181615806e-09, "kl": 0.0455322265625, "learning_rate": 4.437043844996952e-06, "loss": 0.0018, "num_tokens": 151980713.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7196666666666667, "grad_norm": 3.648837010317152e-09, "kl": 0.04718017578125, "learning_rate": 4.427378669548958e-06, "loss": 0.0019, "num_tokens": 152060745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2159 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.72, "grad_norm": 1.6642344169426337e-09, "kl": 0.05120849609375, "learning_rate": 4.417721038622476e-06, "loss": 0.002, "num_tokens": 152137721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2160 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7203333333333334, "grad_norm": 1.6533822089215278e-09, "kl": 0.04241943359375, "learning_rate": 4.408070965292534e-06, "loss": 0.0017, "num_tokens": 152212841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7206666666666667, "grad_norm": 1.6148593573461767e-09, "kl": 0.0435791015625, "learning_rate": 4.398428462623932e-06, "loss": 0.0017, 
"num_tokens": 152288137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.721, "grad_norm": 1.7710726218922446e-09, "kl": 0.0526123046875, "learning_rate": 4.388793543671225e-06, "loss": 0.0021, "num_tokens": 152363241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7213333333333334, "grad_norm": 2.2588511061627514e-09, "kl": 0.0469970703125, "learning_rate": 4.379166221478697e-06, "loss": 0.0019, "num_tokens": 152439721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 2164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7216666666666667, "grad_norm": 1.7437623567317928e-09, "kl": 0.048583984375, "learning_rate": 4.369546509080338e-06, "loss": 0.0019, "num_tokens": 152513753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.722, "grad_norm": 1.8338061069655964e-09, "kl": 0.0426025390625, "learning_rate": 4.359934419499859e-06, "loss": 0.0017, "num_tokens": 152591145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.7223333333333334, "grad_norm": 1.3991672265234456e-09, "kl": 0.04705810546875, "learning_rate": 4.350329965750622e-06, "loss": 0.0019, "num_tokens": 152665849.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7226666666666667, "grad_norm": 1.6801825486467692e-09, "kl": 0.0447998046875, "learning_rate": 4.3407331608356715e-06, "loss": 0.0018, "num_tokens": 152741673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.723, "grad_norm": 1.5790622143185828e-09, "kl": 0.04547119140625, "learning_rate": 4.33114401774769e-06, "loss": 0.0018, "num_tokens": 152817337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7233333333333334, "grad_norm": 2.4515320884432867e-09, "kl": 0.04583740234375, "learning_rate": 4.321562549468991e-06, "loss": 0.0018, "num_tokens": 152894953.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2170 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7236666666666667, "grad_norm": 1.5095218408589517e-09, "kl": 0.04205322265625, "learning_rate": 4.311988768971484e-06, "loss": 0.0017, "num_tokens": 152969737.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.724, "grad_norm": 1.595224619066471e-09, "kl": 0.04669189453125, "learning_rate": 4.302422689216684e-06, "loss": 0.0019, "num_tokens": 153046505.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7243333333333334, "grad_norm": 1.006166483463744e-09, "kl": 0.04779052734375, "learning_rate": 4.292864323155684e-06, "loss": 0.0019, "num_tokens": 153121145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7246666666666667, "grad_norm": 1.7706424104702023e-09, "kl": 0.0487060546875, "learning_rate": 4.2833136837291165e-06, "loss": 0.0019, 
"num_tokens": 153195033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2174 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.725, "grad_norm": 1.425219608996997e-09, "kl": 0.045166015625, "learning_rate": 4.273770783867167e-06, "loss": 0.0018, "num_tokens": 153269817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7253333333333334, "grad_norm": 2.988301606166033e-09, "kl": 0.04571533203125, "learning_rate": 4.264235636489542e-06, "loss": 0.0018, "num_tokens": 153346185.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 2176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7256666666666667, "grad_norm": 1.2484224765074714e-09, "kl": 0.04498291015625, "learning_rate": 4.25470825450544e-06, "loss": 0.0018, "num_tokens": 153421225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.726, "grad_norm": 1.2475552813029367e-09, "kl": 0.04345703125, "learning_rate": 4.245188650813559e-06, "loss": 0.0017, "num_tokens": 153500969.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2178 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.7263333333333334, "grad_norm": 1.612631139735754e-09, "kl": 0.04486083984375, "learning_rate": 4.235676838302069e-06, "loss": 0.0018, "num_tokens": 153576329.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7266666666666667, "grad_norm": 1.6703100014225924e-09, "kl": 0.04779052734375, "learning_rate": 4.226172829848576e-06, "loss": 0.0019, "num_tokens": 153652393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2180 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.727, "grad_norm": 1.538385752120064e-09, "kl": 0.04876708984375, "learning_rate": 4.216676638320135e-06, "loss": 0.0019, "num_tokens": 153727385.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7273333333333334, "grad_norm": 1.7500321192187585e-09, "kl": 0.04718017578125, "learning_rate": 4.207188276573214e-06, "loss": 0.0019, "num_tokens": 153805081.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7276666666666667, "grad_norm": 1.5101997430377878e-09, "kl": 0.04705810546875, "learning_rate": 4.197707757453675e-06, "loss": 0.0019, "num_tokens": 153879801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.728, "grad_norm": 1.2000829663350032e-08, "kl": 0.0457763671875, "learning_rate": 4.188235093796768e-06, "loss": 0.0018, "num_tokens": 153962745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7283333333333334, "grad_norm": 1.7656077710981322e-09, "kl": 0.0460205078125, "learning_rate": 4.178770298427107e-06, "loss": 0.0018, "num_tokens": 154037577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7286666666666667, "grad_norm": 1.1018516099525755e-09, "kl": 0.04669189453125, "learning_rate": 4.169313384158653e-06, "loss": 0.0019, 
"num_tokens": 154111545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.729, "grad_norm": 1.6076042719248562e-09, "kl": 0.0478515625, "learning_rate": 4.1598643637946975e-06, "loss": 0.0019, "num_tokens": 154186937.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7293333333333333, "grad_norm": 2.422549494340842e-09, "kl": 0.04388427734375, "learning_rate": 4.150423250127846e-06, "loss": 0.0018, "num_tokens": 154263385.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 2188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7296666666666667, "grad_norm": 2.1332566824128207e-09, "kl": 0.04473876953125, "learning_rate": 4.140990055939997e-06, "loss": 0.0018, "num_tokens": 154340809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.73, "grad_norm": 1.4593349861868887e-09, "kl": 0.0467529296875, "learning_rate": 4.131564794002324e-06, "loss": 0.0019, "num_tokens": 154414281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2190 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.7303333333333333, "grad_norm": 1.8784636068858163e-09, "kl": 0.0439453125, "learning_rate": 4.12214747707527e-06, "loss": 0.0018, "num_tokens": 154490777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2191 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7306666666666667, "grad_norm": 1.466127441673848e-09, "kl": 0.04290771484375, "learning_rate": 4.1127381179085145e-06, "loss": 0.0017, "num_tokens": 154566505.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": null, "rewards/penalized_accuracy_reward/std": null, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.731, "grad_norm": 1.5561664179486456e-09, "kl": 0.04632568359375, "learning_rate": 4.103336729240967e-06, "loss": 0.0019, "num_tokens": 154640553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7313333333333333, "grad_norm": 1.3554396494086518e-09, "kl": 0.045166015625, "learning_rate": 4.093943323800746e-06, "loss": 0.0018, "num_tokens": 154716201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7316666666666667, "grad_norm": 2.0975106096443596e-09, "kl": 0.04559326171875, "learning_rate": 4.0845579143051625e-06, "loss": 0.0018, "num_tokens": 154791705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.732, "grad_norm": 1.335821675496618e-09, "kl": 0.04095458984375, "learning_rate": 4.075180513460695e-06, "loss": 0.0016, "num_tokens": 154866137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2196 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7323333333333333, "grad_norm": 1.5670132968992334e-09, "kl": 0.042236328125, "learning_rate": 4.065811133962987e-06, "loss": 0.0017, "num_tokens": 154941529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7326666666666667, "grad_norm": 1.7830900089776947e-09, "kl": 0.044921875, "learning_rate": 4.056449788496824e-06, "loss": 0.0018, "num_tokens": 
155017337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.733, "grad_norm": 1.6873773489578525e-09, "kl": 0.0445556640625, "learning_rate": 4.047096489736102e-06, "loss": 0.0018, "num_tokens": 155093417.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2199 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7333333333333333, "grad_norm": 1.994529652549204e-09, "kl": 0.04571533203125, "learning_rate": 4.037751250343841e-06, "loss": 0.0018, "num_tokens": 155168889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 2200 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7336666666666667, "grad_norm": 1.3627780015568192e-09, "kl": 0.04937744140625, "learning_rate": 4.028414082972141e-06, "loss": 0.002, "num_tokens": 155242681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2201 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.734, "grad_norm": 1.4729567565652246e-09, "kl": 0.043701171875, "learning_rate": 4.019085000262164e-06, "loss": 0.0017, "num_tokens": 155318105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2202 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.7343333333333333, "grad_norm": 1.828477591558908e-09, "kl": 0.04541015625, "learning_rate": 4.009764014844143e-06, "loss": 0.0018, "num_tokens": 155391433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2203 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7346666666666667, "grad_norm": 2.021040668154228e-09, "kl": 0.04583740234375, "learning_rate": 4.000451139337338e-06, "loss": 0.0018, "num_tokens": 155467161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.735, "grad_norm": 2.5734336883687092e-09, "kl": 0.04534912109375, "learning_rate": 3.9911463863500365e-06, "loss": 0.0018, "num_tokens": 155544313.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2205 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7353333333333333, "grad_norm": 1.1503157315573276e-09, "kl": 0.0467529296875, "learning_rate": 3.981849768479516e-06, "loss": 0.0019, "num_tokens": 155620089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7356666666666667, "grad_norm": 1.616610290078313e-09, "kl": 0.04827880859375, "learning_rate": 3.972561298312063e-06, "loss": 0.0019, "num_tokens": 155696233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2207 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.736, "grad_norm": 2.1778323588961257e-09, "kl": 0.04852294921875, "learning_rate": 3.96328098842291e-06, "loss": 0.0019, "num_tokens": 155772857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7363333333333333, "grad_norm": 1.3141869814603524e-09, "kl": 0.041748046875, "learning_rate": 3.954008851376252e-06, "loss": 0.0017, "num_tokens": 155848377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2209 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7366666666666667, "grad_norm": 1.4419735405724055e-09, "kl": 0.04840087890625, "learning_rate": 3.944744899725221e-06, "loss": 0.0019, 
"num_tokens": 155923609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2210 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.737, "grad_norm": 1.4683484428346105e-09, "kl": 0.0452880859375, "learning_rate": 3.9354891460118695e-06, "loss": 0.0018, "num_tokens": 155998777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2211 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7373333333333333, "grad_norm": 1.9341483969981255e-09, "kl": 0.0418701171875, "learning_rate": 3.9262416027671354e-06, "loss": 0.0017, "num_tokens": 156075849.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 2212 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7376666666666667, "grad_norm": 2.080253524994191e-09, "kl": 0.0482177734375, "learning_rate": 3.917002282510854e-06, "loss": 0.0019, "num_tokens": 156151481.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2213 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.738, "grad_norm": 0.00037415779661387205, "kl": 0.046875, "learning_rate": 3.907771197751737e-06, "loss": 0.0019, "num_tokens": 156226617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2214 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.7383333333333333, "grad_norm": 1.9200918632833464e-09, "kl": 0.043701171875, "learning_rate": 3.898548360987325e-06, "loss": 0.0017, "num_tokens": 156304889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7386666666666667, "grad_norm": 1.5810110998160098e-09, "kl": 0.04522705078125, "learning_rate": 3.889333784704003e-06, "loss": 0.0018, "num_tokens": 156378201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.739, "grad_norm": 1.56274360119113e-09, "kl": 0.04571533203125, "learning_rate": 3.880127481376975e-06, "loss": 0.0018, "num_tokens": 156453193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2217 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7393333333333333, "grad_norm": 1.0836347374976185e-09, "kl": 0.045166015625, "learning_rate": 3.8709294634702374e-06, "loss": 0.0018, "num_tokens": 156527657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2218 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7396666666666667, "grad_norm": 1.7463023249675302e-09, "kl": 0.0477294921875, "learning_rate": 3.861739743436575e-06, "loss": 0.0019, "num_tokens": 156605993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.74, "grad_norm": 2.3053099429404256e-09, "kl": 0.045654296875, "learning_rate": 3.852558333717536e-06, "loss": 0.0018, "num_tokens": 156691833.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2220 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7403333333333333, "grad_norm": 1.6341034081435168e-09, "kl": 0.0462646484375, "learning_rate": 3.8433852467434175e-06, "loss": 0.0019, "num_tokens": 156766553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7406666666666667, "grad_norm": 1.4812049364820723e-09, "kl": 0.04595947265625, "learning_rate": 3.834220494933252e-06, "loss": 0.0018, 
"num_tokens": 156842393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.741, "grad_norm": 1.509101621444131e-09, "kl": 0.046630859375, "learning_rate": 3.825064090694785e-06, "loss": 0.0019, "num_tokens": 156918153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7413333333333333, "grad_norm": 3.709704987642226e-09, "kl": 0.04644775390625, "learning_rate": 3.81591604642446e-06, "loss": 0.0019, "num_tokens": 156994569.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 2224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7416666666666667, "grad_norm": 1.5297644262446397e-09, "kl": 0.04473876953125, "learning_rate": 3.8067763745074017e-06, "loss": 0.0018, "num_tokens": 157072681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2225 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.742, "grad_norm": 1.262082105490947e-09, "kl": 0.046630859375, "learning_rate": 3.797645087317401e-06, "loss": 0.0019, "num_tokens": 157145897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2226 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.7423333333333333, "grad_norm": 1.6993195739445355e-09, "kl": 0.04669189453125, "learning_rate": 3.7885221972168974e-06, "loss": 0.0019, "num_tokens": 157219929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7426666666666667, "grad_norm": 1.5095448224755614e-09, "kl": 0.04541015625, "learning_rate": 3.779407716556962e-06, "loss": 0.0018, "num_tokens": 157298521.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2228 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.743, "grad_norm": 1.6611865216731303e-09, "kl": 0.0457763671875, "learning_rate": 3.77030165767728e-06, "loss": 0.0018, "num_tokens": 157371961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2229 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7433333333333333, "grad_norm": 1.2625971379520706e-09, "kl": 0.0478515625, "learning_rate": 3.7612040329061405e-06, "loss": 0.0019, "num_tokens": 157445657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2230 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7436666666666667, "grad_norm": 1.2067157273421003e-09, "kl": 0.04266357421875, "learning_rate": 3.7521148545604003e-06, "loss": 0.0017, "num_tokens": 157521177.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2231 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, 
"completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.744, "grad_norm": 8.376968563261755e-10, "kl": 0.0438232421875, "learning_rate": 3.7430341349454924e-06, "loss": 0.0018, "num_tokens": 157598345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2232 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7443333333333333, "grad_norm": 1.495280566032875e-09, "kl": 0.04571533203125, "learning_rate": 3.7339618863553983e-06, "loss": 0.0018, "num_tokens": 157674889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2233 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7446666666666667, "grad_norm": 1.8390252654043593e-09, "kl": 0.04840087890625, "learning_rate": 3.7248981210726186e-06, "loss": 0.0019, 
"num_tokens": 157751497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2234 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.745, "grad_norm": 1.4266673398211083e-09, "kl": 0.04388427734375, "learning_rate": 3.7158428513681876e-06, "loss": 0.0018, "num_tokens": 157825913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2235 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7453333333333333, "grad_norm": 1.625565237972637e-09, "kl": 0.04443359375, "learning_rate": 3.7067960895016277e-06, "loss": 0.0018, "num_tokens": 157899161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 2236 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7456666666666667, "grad_norm": 1.4522918423409692e-09, "kl": 0.04644775390625, "learning_rate": 3.6977578477209352e-06, "loss": 0.0019, "num_tokens": 157975753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2237 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.746, "grad_norm": 2.4619606353581958e-09, "kl": 0.04443359375, "learning_rate": 3.6887281382625838e-06, "loss": 0.0018, "num_tokens": 158053209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2238 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.7463333333333333, "grad_norm": 2.241760110877067e-09, "kl": 0.04345703125, "learning_rate": 3.679706973351491e-06, "loss": 0.0017, "num_tokens": 158131609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2239 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7466666666666667, "grad_norm": 1.5271621744972208e-09, "kl": 0.04425048828125, "learning_rate": 3.6706943652010073e-06, "loss": 0.0018, "num_tokens": 158206153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2240 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.747, "grad_norm": 2.35271602200271e-09, "kl": 0.042236328125, "learning_rate": 3.661690326012897e-06, "loss": 0.0017, "num_tokens": 158288633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 
0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2241 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7473333333333333, "grad_norm": 1.4481499333030001e-09, "kl": 0.046630859375, "learning_rate": 3.6526948679773256e-06, "loss": 0.0019, "num_tokens": 158364537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2242 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7476666666666667, "grad_norm": 1.6750054676606396e-09, "kl": 0.04803466796875, "learning_rate": 3.6437080032728355e-06, "loss": 0.0019, "num_tokens": 158439609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2243 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, 
"completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.748, "grad_norm": 1.4223997535367516e-09, "kl": 0.0439453125, "learning_rate": 3.634729744066341e-06, "loss": 0.0018, "num_tokens": 158514809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2244 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7483333333333333, "grad_norm": 1.1278374900669519e-09, "kl": 0.04345703125, "learning_rate": 3.625760102513103e-06, "loss": 0.0017, "num_tokens": 158589209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2245 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7486666666666667, "grad_norm": 1.9592740763130223e-09, "kl": 0.0450439453125, "learning_rate": 3.6167990907567207e-06, "loss": 0.0018, "num_tokens": 158667289.0, "reward": 0.0, "reward_std": 
0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2246 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.749, "grad_norm": 1.509282587797145e-09, "kl": 0.050048828125, "learning_rate": 3.6078467209290936e-06, "loss": 0.002, "num_tokens": 158740137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2247 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7493333333333333, "grad_norm": 1.8624178865778163e-09, "kl": 0.0458984375, "learning_rate": 3.598903005150444e-06, "loss": 0.0018, "num_tokens": 158815113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2248 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7496666666666667, "grad_norm": 1.337460475703267e-09, "kl": 0.04266357421875, "learning_rate": 3.5899679555292654e-06, "loss": 0.0017, "num_tokens": 158890697.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2249 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.75, "grad_norm": 1.4602919984341156e-09, "kl": 0.041259765625, "learning_rate": 3.5810415841623146e-06, "loss": 0.0017, "num_tokens": 158966825.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2250 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7503333333333333, "grad_norm": 2.3775514890189697e-09, 
"kl": 0.04730224609375, "learning_rate": 3.5721239031346067e-06, "loss": 0.0019, "num_tokens": 159043097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2251 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7506666666666667, "grad_norm": 1.4219833088802147e-09, "kl": 0.044189453125, "learning_rate": 3.563214924519394e-06, "loss": 0.0018, "num_tokens": 159118777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2252 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.751, "grad_norm": 1.4575084472667754e-09, "kl": 0.04547119140625, "learning_rate": 3.554314660378133e-06, "loss": 0.0018, "num_tokens": 159193993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2253 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7513333333333333, "grad_norm": 1.7561726517456577e-09, "kl": 0.0458984375, "learning_rate": 3.545423122760493e-06, "loss": 0.0018, "num_tokens": 159268649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2254 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7516666666666667, "grad_norm": 1.185661235858504e-09, "kl": 0.045166015625, "learning_rate": 3.5365403237043373e-06, "loss": 0.0018, "num_tokens": 159343033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2255 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 
0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.752, "grad_norm": 1.4063439301992275e-09, "kl": 0.04339599609375, "learning_rate": 3.527666275235677e-06, "loss": 0.0017, "num_tokens": 159417033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2256 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7523333333333333, "grad_norm": 1.7714633093746102e-09, "kl": 0.0460205078125, "learning_rate": 3.5188009893686916e-06, "loss": 0.0018, "num_tokens": 159494425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2257 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7526666666666667, "grad_norm": 1.5727866786718891e-09, "kl": 0.040771484375, "learning_rate": 3.5099444781056956e-06, "loss": 0.0016, "num_tokens": 159571881.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2258 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.753, "grad_norm": 1.9258610262085085e-09, "kl": 0.04815673828125, "learning_rate": 3.5010967534371167e-06, "loss": 0.0019, "num_tokens": 159647193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2259 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7533333333333333, "grad_norm": 1.6801326996329635e-09, "kl": 0.04266357421875, "learning_rate": 3.492257827341492e-06, "loss": 0.0017, "num_tokens": 159723369.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2260 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7536666666666667, "grad_norm": 2.1229209501427704e-09, "kl": 0.04486083984375, "learning_rate": 3.483427711785449e-06, "loss": 0.0018, "num_tokens": 159802473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2261 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.754, "grad_norm": 1.4113106239221906e-09, "kl": 0.04278564453125, "learning_rate": 3.474606418723683e-06, "loss": 0.0017, "num_tokens": 159877369.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2262 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7543333333333333, "grad_norm": 1.7367685067881666e-09, "kl": 
0.04779052734375, "learning_rate": 3.4657939600989453e-06, "loss": 0.0019, "num_tokens": 159952393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2263 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7546666666666667, "grad_norm": 9.468510420163057e-10, "kl": 0.04681396484375, "learning_rate": 3.45699034784203e-06, "loss": 0.0019, "num_tokens": 160026921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2264 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.755, "grad_norm": 1.697812224144002e-09, "kl": 0.0482177734375, "learning_rate": 3.4481955938717514e-06, "loss": 0.0019, "num_tokens": 160102457.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2265 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7553333333333333, "grad_norm": 1.3509019458624039e-09, "kl": 0.0478515625, "learning_rate": 3.4394097100949286e-06, "loss": 0.0019, "num_tokens": 160176153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2266 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7556666666666667, "grad_norm": 1.6573364902683352e-09, "kl": 0.04400634765625, "learning_rate": 3.4306327084063762e-06, "loss": 0.0018, "num_tokens": 160252457.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2267 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 
0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.756, "grad_norm": 1.6228497434767064e-09, "kl": 0.0472412109375, "learning_rate": 3.4218646006888836e-06, "loss": 0.0019, "num_tokens": 160327673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2268 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7563333333333333, "grad_norm": 1.7708124966375749e-09, "kl": 0.04443359375, "learning_rate": 3.4131053988131947e-06, "loss": 0.0018, "num_tokens": 160405705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2269 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7566666666666667, "grad_norm": 1.0130717376100051e-09, "kl": 0.04345703125, "learning_rate": 3.4043551146380026e-06, "loss": 0.0017, "num_tokens": 160479033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2270 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.757, "grad_norm": 1.2933142334858871e-09, "kl": 0.04498291015625, "learning_rate": 3.3956137600099248e-06, "loss": 0.0018, "num_tokens": 160552425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2271 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7573333333333333, "grad_norm": 1.544075423076663e-09, "kl": 0.04736328125, "learning_rate": 3.3868813467634833e-06, "loss": 0.0019, "num_tokens": 160626761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2272 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7576666666666667, "grad_norm": 1.4945190640602846e-09, "kl": 0.048095703125, "learning_rate": 3.3781578867211016e-06, "loss": 0.0019, "num_tokens": 160702905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2273 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.758, "grad_norm": 1.3849410507305038e-09, "kl": 0.04541015625, "learning_rate": 3.3694433916930803e-06, "loss": 0.0018, "num_tokens": 160778617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2274 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7583333333333333, "grad_norm": 1.4927449276669336e-09, "kl": 0.0430908203125, "learning_rate": 
3.360737873477584e-06, "loss": 0.0017, "num_tokens": 160854937.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2275 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7586666666666667, "grad_norm": 1.297062901528534e-09, "kl": 0.04217529296875, "learning_rate": 3.3520413438606215e-06, "loss": 0.0017, "num_tokens": 160929801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2276 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.759, "grad_norm": 1.4698717798466987e-09, "kl": 0.048095703125, "learning_rate": 3.343353814616036e-06, "loss": 0.0019, "num_tokens": 161006521.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2277 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7593333333333333, "grad_norm": 1.4599065289999658e-09, "kl": 0.04058837890625, "learning_rate": 3.3346752975054763e-06, "loss": 0.0016, "num_tokens": 161080665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2278 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7596666666666667, "grad_norm": 1.3320082814516354e-09, "kl": 0.04168701171875, "learning_rate": 3.3260058042784014e-06, "loss": 0.0017, "num_tokens": 161155449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2279 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.76, "grad_norm": 1.2108786195952348e-09, "kl": 0.0477294921875, "learning_rate": 3.3173453466720473e-06, "loss": 0.0019, "num_tokens": 161229177.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2280 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7603333333333333, "grad_norm": 1.9602948153618627e-09, "kl": 0.04644775390625, "learning_rate": 3.308693936411421e-06, "loss": 0.0019, "num_tokens": 161304921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2281 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7606666666666667, "grad_norm": 2.8977975574662196e-09, "kl": 0.04522705078125, "learning_rate": 3.3000515852092684e-06, "loss": 0.0018, "num_tokens": 161381913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2282 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.761, "grad_norm": 1.3070662330250116e-09, "kl": 0.0399169921875, "learning_rate": 3.291418304766092e-06, "loss": 0.0016, "num_tokens": 161457289.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2283 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7613333333333333, "grad_norm": 1.643856051281034e-09, "kl": 0.0478515625, "learning_rate": 3.2827941067700996e-06, "loss": 0.0019, "num_tokens": 161531529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2284 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7616666666666667, "grad_norm": 1.9552484076257315e-09, "kl": 0.049072265625, "learning_rate": 3.2741790028972e-06, "loss": 0.002, "num_tokens": 161606649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2285 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.762, "grad_norm": 1.3871428450329404e-09, "kl": 0.04248046875, "learning_rate": 3.265573004810997e-06, "loss": 0.0017, "num_tokens": 161682041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2286 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7623333333333333, "grad_norm": 1.2043264163708045e-09, "kl": 0.04296875, "learning_rate": 3.2569761241627694e-06, "loss": 0.0017, 
"num_tokens": 161756713.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2287 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7626666666666667, "grad_norm": 9.494676156407422e-10, "kl": 0.0458984375, "learning_rate": 3.24838837259144e-06, "loss": 0.0018, "num_tokens": 161829897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2288 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.763, "grad_norm": 1.6194705576566548e-09, "kl": 0.0482177734375, "learning_rate": 3.239809761723579e-06, "loss": 0.0019, "num_tokens": 161906537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, 
"rewards/tag_count_reward/std": 0.0, "step": 2289 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7633333333333333, "grad_norm": 2.926664910418708e-09, "kl": 0.04815673828125, "learning_rate": 3.2312403031733943e-06, "loss": 0.0019, "num_tokens": 161983929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2290 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7636666666666667, "grad_norm": 2.797790443764825e-09, "kl": 0.04498291015625, "learning_rate": 3.222680008542678e-06, "loss": 0.0018, "num_tokens": 162063881.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2291 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 
0.0, "epoch": 0.764, "grad_norm": 1.7914595362711339e-09, "kl": 0.04681396484375, "learning_rate": 3.2141288894208334e-06, "loss": 0.0019, "num_tokens": 162138409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2292 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7643333333333333, "grad_norm": 2.1897184065977626e-09, "kl": 0.04925537109375, "learning_rate": 3.2055869573848374e-06, "loss": 0.002, "num_tokens": 162211673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2293 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7646666666666667, "grad_norm": 2.216133276888854e-09, "kl": 0.04693603515625, "learning_rate": 3.1970542239992244e-06, "loss": 0.0019, "num_tokens": 162286921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2294 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.765, "grad_norm": 1.800288029762953e-09, "kl": 0.04779052734375, "learning_rate": 3.188530700816078e-06, "loss": 0.0019, "num_tokens": 162363097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2295 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7653333333333333, "grad_norm": 2.3931114867536962e-09, "kl": 0.04345703125, "learning_rate": 3.1800163993750166e-06, "loss": 0.0017, "num_tokens": 162438361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2296 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 
0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7656666666666667, "grad_norm": 1.9011319185580078e-09, "kl": 0.04498291015625, "learning_rate": 3.1715113312031674e-06, "loss": 0.0018, "num_tokens": 162514041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2297 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.766, "grad_norm": 1.766339630115965e-09, "kl": 0.04669189453125, "learning_rate": 3.1630155078151626e-06, "loss": 0.0019, "num_tokens": 162590473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2298 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7663333333333333, "grad_norm": 1.2543162064559965e-09, "kl": 0.04583740234375, "learning_rate": 3.1545289407131128e-06, "loss": 0.0018, "num_tokens": 162665433.0, "reward": 0.0, 
"reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2299 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7666666666666667, "grad_norm": 1.3188758973825543e-09, "kl": 0.04119873046875, "learning_rate": 3.146051641386605e-06, "loss": 0.0016, "num_tokens": 162741929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2300 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.767, "grad_norm": 2.019062028679741e-09, "kl": 0.04248046875, "learning_rate": 3.1375836213126653e-06, "loss": 0.0017, "num_tokens": 162820873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 
2301 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7673333333333333, "grad_norm": 1.2310045205410347e-09, "kl": 0.04547119140625, "learning_rate": 3.1291248919557717e-06, "loss": 0.0018, "num_tokens": 162896425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2302 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7676666666666667, "grad_norm": 1.3521155306506216e-09, "kl": 0.046142578125, "learning_rate": 3.1206754647678137e-06, "loss": 0.0018, "num_tokens": 162970121.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2303 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.768, "grad_norm": 
1.940174021441976e-09, "kl": 0.04833984375, "learning_rate": 3.1122353511880943e-06, "loss": 0.0019, "num_tokens": 163046377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2304 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7683333333333333, "grad_norm": 1.8324327610841351e-09, "kl": 0.04742431640625, "learning_rate": 3.103804562643302e-06, "loss": 0.0019, "num_tokens": 163120697.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2305 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7686666666666667, "grad_norm": 1.0934843031051855e-09, "kl": 0.0465087890625, "learning_rate": 3.0953831105475064e-06, "loss": 0.0019, "num_tokens": 163195849.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2306 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.769, "grad_norm": 1.4514314194968847e-09, "kl": 0.047119140625, "learning_rate": 3.086971006302125e-06, "loss": 0.0019, "num_tokens": 163270969.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2307 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7693333333333333, "grad_norm": 1.7702331822633255e-09, "kl": 0.0458984375, "learning_rate": 3.0785682612959334e-06, "loss": 0.0018, "num_tokens": 163345817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2308 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7696666666666667, "grad_norm": 1.3495302653154795e-09, "kl": 0.04681396484375, "learning_rate": 3.0701748869050285e-06, "loss": 0.0019, "num_tokens": 163420313.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2309 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.77, "grad_norm": 1.9393309180770757e-09, "kl": 0.0455322265625, "learning_rate": 3.0617908944928223e-06, "loss": 0.0018, "num_tokens": 163494665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2310 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7703333333333333, "grad_norm": 1.882447975276591e-09, "kl": 0.0418701171875, "learning_rate": 3.0534162954100264e-06, "loss": 0.0017, "num_tokens": 163570681.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2311 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7706666666666667, "grad_norm": 1.077265610049949e-09, "kl": 0.04473876953125, "learning_rate": 3.0450511009946373e-06, "loss": 0.0018, "num_tokens": 163646169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2312 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.771, "grad_norm": 1.3503392848335238e-09, "kl": 0.040771484375, "learning_rate": 3.0366953225719076e-06, "loss": 0.0016, "num_tokens": 163722537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2313 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7713333333333333, "grad_norm": 1.962284779111201e-09, "kl": 0.04376220703125, "learning_rate": 3.028348971454356e-06, "loss": 0.0017, "num_tokens": 163798409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2314 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7716666666666666, "grad_norm": 1.641190849888119e-09, "kl": 0.04571533203125, "learning_rate": 3.0200120589417293e-06, "loss": 0.0018, "num_tokens": 163873417.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2315 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.772, "grad_norm": 2.3541986138297943e-09, 
"kl": 0.04461669921875, "learning_rate": 3.0116845963209996e-06, "loss": 0.0018, "num_tokens": 163954057.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7723333333333333, "grad_norm": 2.1685944151528247e-09, "kl": 0.044677734375, "learning_rate": 3.003366594866345e-06, "loss": 0.0018, "num_tokens": 164029993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7726666666666666, "grad_norm": 1.3567931222979723e-09, "kl": 0.04339599609375, "learning_rate": 2.995058065839136e-06, "loss": 0.0017, "num_tokens": 164105609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2318 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.773, "grad_norm": 1.2452002762231018e-09, "kl": 0.04486083984375, "learning_rate": 2.9867590204879117e-06, "loss": 0.0018, "num_tokens": 164180921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7733333333333333, "grad_norm": 1.4944585569054425e-09, "kl": 0.04345703125, "learning_rate": 2.978469470048376e-06, "loss": 0.0017, "num_tokens": 164255977.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2320 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7736666666666666, "grad_norm": 1.7076284830608301e-09, "kl": 0.047119140625, "learning_rate": 2.970189425743383e-06, "loss": 0.0019, "num_tokens": 164330345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2321 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.774, "grad_norm": 1.4634683465075682e-09, "kl": 0.04388427734375, "learning_rate": 2.961918898782914e-06, "loss": 0.0018, "num_tokens": 164407049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7743333333333333, "grad_norm": 1.6869347030379345e-09, "kl": 0.0457763671875, "learning_rate": 2.953657900364053e-06, "loss": 0.0018, "num_tokens": 164482073.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2323 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7746666666666666, "grad_norm": 1.0466174593659616e-09, "kl": 0.04376220703125, "learning_rate": 2.945406441671005e-06, "loss": 0.0018, "num_tokens": 164559065.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2324 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.775, "grad_norm": 1.0007918938015337e-09, "kl": 0.0419921875, "learning_rate": 2.9371645338750477e-06, "loss": 0.0017, "num_tokens": 164634313.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2325 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7753333333333333, "grad_norm": 2.378643948475201e-09, "kl": 0.04864501953125, "learning_rate": 2.9289321881345257e-06, "loss": 0.0019, "num_tokens": 164712217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2326 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7756666666666666, "grad_norm": 1.942564331613994e-09, "kl": 0.0478515625, "learning_rate": 2.9207094155948435e-06, "loss": 0.0019, "num_tokens": 164790921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2327 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.776, "grad_norm": 1.4715664242714865e-09, 
"kl": 0.04351806640625, "learning_rate": 2.912496227388446e-06, "loss": 0.0017, "num_tokens": 164866057.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2328 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7763333333333333, "grad_norm": 1.4215539856365922e-09, "kl": 0.04486083984375, "learning_rate": 2.9042926346347932e-06, "loss": 0.0018, "num_tokens": 164940665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2329 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7766666666666666, "grad_norm": 1.0158451857478212e-09, "kl": 0.043701171875, "learning_rate": 2.896098648440362e-06, "loss": 0.0017, "num_tokens": 165015929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2330 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.777, "grad_norm": 1.7840733335106052e-09, "kl": 0.0423583984375, "learning_rate": 2.8879142798986293e-06, "loss": 0.0017, "num_tokens": 165093049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2331 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7773333333333333, "grad_norm": 1.058192755643006e-09, "kl": 0.0423583984375, "learning_rate": 2.8797395400900362e-06, "loss": 0.0017, "num_tokens": 165168729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2332 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7776666666666666, "grad_norm": 1.3663588038781427e-09, "kl": 0.0498046875, "learning_rate": 2.8715744400819976e-06, "loss": 0.002, "num_tokens": 165244489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2333 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.778, "grad_norm": 1.1639131880514242e-09, "kl": 0.04754638671875, "learning_rate": 2.863418990928876e-06, "loss": 0.0019, "num_tokens": 165319865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2334 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7783333333333333, "grad_norm": 2.2448625180970794e-09, "kl": 0.04278564453125, "learning_rate": 2.855273203671969e-06, "loss": 0.0017, "num_tokens": 165396313.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2335 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7786666666666666, "grad_norm": 1.9980463950020066e-09, "kl": 0.0447998046875, "learning_rate": 2.8471370893394866e-06, "loss": 0.0018, "num_tokens": 165474361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2336 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.779, "grad_norm": 1.5071405234934332e-09, "kl": 0.0447998046875, "learning_rate": 2.8390106589465514e-06, "loss": 0.0018, "num_tokens": 165546777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2337 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7793333333333333, "grad_norm": 1.938062599293744e-09, "kl": 0.046630859375, "learning_rate": 2.830893923495173e-06, "loss": 0.0019, "num_tokens": 165626217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2338 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7796666666666666, "grad_norm": 1.4758693156480263e-09, "kl": 0.04693603515625, "learning_rate": 2.8227868939742333e-06, "loss": 0.0019, "num_tokens": 165706105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2339 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.78, "grad_norm": 1.8107634280895013e-09, 
"kl": 0.043701171875, "learning_rate": 2.8146895813594754e-06, "loss": 0.0018, "num_tokens": 165781481.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2340 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7803333333333333, "grad_norm": 1.6678356473676104e-09, "kl": 0.04522705078125, "learning_rate": 2.8066019966134907e-06, "loss": 0.0018, "num_tokens": 165857641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2341 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7806666666666666, "grad_norm": 1.5116690121885767e-09, "kl": 0.04693603515625, "learning_rate": 2.79852415068569e-06, "loss": 0.0019, "num_tokens": 165932601.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2342 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.781, "grad_norm": 2.192241277398921e-09, "kl": 0.046875, "learning_rate": 2.7904560545123082e-06, "loss": 0.0019, "num_tokens": 166011609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2343 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7813333333333333, "grad_norm": 1.1048826298321046e-09, "kl": 0.046875, "learning_rate": 2.7823977190163788e-06, "loss": 0.0019, "num_tokens": 166087257.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2344 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7816666666666666, "grad_norm": 1.3459784398150987e-09, "kl": 0.047119140625, "learning_rate": 2.7743491551077197e-06, "loss": 0.0019, "num_tokens": 166162281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2345 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.782, "grad_norm": 1.7745145353131875e-09, "kl": 0.0438232421875, "learning_rate": 2.76631037368292e-06, "loss": 0.0018, "num_tokens": 166237513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2346 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7823333333333333, "grad_norm": 1.6565930849310462e-09, "kl": 0.0460205078125, "learning_rate": 2.7582813856253276e-06, "loss": 0.0018, "num_tokens": 166314089.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2347 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7826666666666666, "grad_norm": 1.580381492338745e-09, "kl": 0.04852294921875, "learning_rate": 2.750262201805022e-06, "loss": 0.0019, "num_tokens": 166388905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2348 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.783, "grad_norm": 2.375486696237772e-09, "kl": 0.05133056640625, "learning_rate": 2.742252833078818e-06, "loss": 0.0021, "num_tokens": 166465465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2349 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7833333333333333, "grad_norm": 1.5615480020159112e-09, "kl": 0.05078125, "learning_rate": 2.7342532902902418e-06, "loss": 0.002, "num_tokens": 166541049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2350 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7836666666666666, "grad_norm": 1.1830009194468971e-09, "kl": 0.040771484375, "learning_rate": 2.726263584269513e-06, "loss": 0.0016, "num_tokens": 166616697.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2351 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.784, "grad_norm": 9.003793821626971e-10, "kl": 
0.05426025390625, "learning_rate": 2.718283725833537e-06, "loss": 0.0022, "num_tokens": 166691801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2352 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7843333333333333, "grad_norm": 1.589586351435912e-09, "kl": 0.04638671875, "learning_rate": 2.7103137257858867e-06, "loss": 0.0019, "num_tokens": 166765817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2353 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7846666666666666, "grad_norm": 1.4814635074245075e-09, "kl": 0.04248046875, "learning_rate": 2.7023535949167825e-06, "loss": 0.0017, "num_tokens": 166842025.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2354 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.785, "grad_norm": 2.4501907169849346e-09, "kl": 0.04644775390625, "learning_rate": 2.6944033440030894e-06, "loss": 0.0019, "num_tokens": 166921817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2355 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7853333333333333, "grad_norm": 1.3121582709274549e-09, "kl": 0.04400634765625, "learning_rate": 2.6864629838082957e-06, "loss": 0.0018, "num_tokens": 166996009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2356 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7856666666666666, "grad_norm": 1.3741502380426596e-09, "kl": 0.045166015625, "learning_rate": 2.678532525082498e-06, "loss": 0.0018, "num_tokens": 167071689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2357 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.786, "grad_norm": 1.5541981035482877e-09, "kl": 0.0478515625, "learning_rate": 2.670611978562386e-06, "loss": 0.0019, "num_tokens": 167148217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2358 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7863333333333333, "grad_norm": 1.3386144415150625e-09, "kl": 0.04669189453125, "learning_rate": 2.6627013549712355e-06, "loss": 0.0019, "num_tokens": 167223721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2359 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7866666666666666, "grad_norm": 1.6004723102369667e-09, "kl": 0.04437255859375, "learning_rate": 2.654800665018884e-06, "loss": 0.0018, "num_tokens": 167298889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2360 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.787, "grad_norm": 1.6842296446384353e-09, "kl": 0.0457763671875, "learning_rate": 2.6469099194017144e-06, "loss": 0.0018, "num_tokens": 167376121.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2361 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7873333333333333, "grad_norm": 9.208410145511436e-10, "kl": 0.04443359375, "learning_rate": 2.639029128802657e-06, "loss": 0.0018, "num_tokens": 167450233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2362 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7876666666666666, "grad_norm": 1.051167486387783e-09, "kl": 0.04791259765625, "learning_rate": 2.6311583038911625e-06, "loss": 0.0019, "num_tokens": 167524985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2363 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.788, "grad_norm": 2.5571273987168297e-09, "kl": 0.04632568359375, "learning_rate": 
2.623297455323177e-06, "loss": 0.0018, "num_tokens": 167601401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2364 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7883333333333333, "grad_norm": 2.6806503683474148e-09, "kl": 0.045654296875, "learning_rate": 2.615446593741161e-06, "loss": 0.0018, "num_tokens": 167679097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2365 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7886666666666666, "grad_norm": 1.7121318807156172e-09, "kl": 0.05291748046875, "learning_rate": 2.607605729774041e-06, "loss": 0.0021, "num_tokens": 167754297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2366 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.789, "grad_norm": 1.2956629102944817e-09, "kl": 0.0440673828125, "learning_rate": 2.5997748740372053e-06, "loss": 0.0018, "num_tokens": 167830233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2367 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7893333333333333, "grad_norm": 2.3688029315849235e-09, "kl": 0.04931640625, "learning_rate": 2.5919540371325005e-06, "loss": 0.002, "num_tokens": 167907673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2368 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7896666666666666, "grad_norm": 9.83351400307697e-10, "kl": 0.04486083984375, "learning_rate": 2.584143229648207e-06, "loss": 0.0018, "num_tokens": 167983145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.79, "grad_norm": 1.5275106735046506e-09, "kl": 0.043701171875, "learning_rate": 2.576342462159024e-06, "loss": 0.0017, "num_tokens": 168057593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2370 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7903333333333333, "grad_norm": 1.900793966669312e-09, "kl": 0.0477294921875, "learning_rate": 2.5685517452260566e-06, "loss": 0.0019, "num_tokens": 168134409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2371 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7906666666666666, "grad_norm": 2.0142010281887224e-09, "kl": 0.04229736328125, "learning_rate": 2.5607710893968165e-06, "loss": 0.0017, "num_tokens": 168213913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2372 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.791, "grad_norm": 1.7203960478440194e-09, "kl": 0.0479736328125, "learning_rate": 2.5530005052051742e-06, "loss": 0.0019, "num_tokens": 168289689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2373 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7913333333333333, "grad_norm": 2.5012092397247443e-09, "kl": 0.05194091796875, "learning_rate": 2.5452400031713786e-06, "loss": 0.0021, "num_tokens": 168366169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2374 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7916666666666666, "grad_norm": 2.4596449321734326e-09, "kl": 0.04766845703125, "learning_rate": 2.5374895938020226e-06, "loss": 0.0019, "num_tokens": 168446025.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2375 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.792, "grad_norm": 1.0837808428476592e-09, "kl": 0.044921875, "learning_rate": 
2.529749287590042e-06, "loss": 0.0018, "num_tokens": 168520761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2376 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7923333333333333, "grad_norm": 1.77758718855614e-09, "kl": 0.0482177734375, "learning_rate": 2.522019095014683e-06, "loss": 0.0019, "num_tokens": 168595337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2377 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7926666666666666, "grad_norm": 2.5617037380243346e-09, "kl": 0.0482177734375, "learning_rate": 2.514299026541508e-06, "loss": 0.0019, "num_tokens": 168672425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2378 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.793, "grad_norm": 1.6600208985195763e-09, "kl": 0.04388427734375, "learning_rate": 2.506589092622371e-06, "loss": 0.0018, "num_tokens": 168748361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2379 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7933333333333333, "grad_norm": 1.4085200783497953e-09, "kl": 0.042724609375, "learning_rate": 2.4988893036954045e-06, "loss": 0.0017, "num_tokens": 168823993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2380 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7936666666666666, "grad_norm": 2.6387643181635667e-09, "kl": 0.0457763671875, "learning_rate": 2.4911996701850083e-06, "loss": 0.0018, "num_tokens": 168900089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2381 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.794, "grad_norm": 1.388531623014444e-09, "kl": 0.04876708984375, "learning_rate": 2.4835202025018325e-06, "loss": 0.002, "num_tokens": 168974969.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2382 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7943333333333333, "grad_norm": 1.4662665526188334e-09, "kl": 0.048095703125, "learning_rate": 2.4758509110427576e-06, "loss": 0.0019, "num_tokens": 169049641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2383 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7946666666666666, "grad_norm": 1.5331171887567052e-09, "kl": 0.04803466796875, "learning_rate": 2.468191806190897e-06, "loss": 0.0019, "num_tokens": 169124025.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2384 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.795, "grad_norm": 1.3462297943078738e-09, "kl": 0.0418701171875, "learning_rate": 2.4605428983155667e-06, "loss": 0.0017, "num_tokens": 169201065.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2385 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7953333333333333, "grad_norm": 1.4274281756598839e-09, "kl": 0.04583740234375, "learning_rate": 2.45290419777228e-06, "loss": 0.0018, "num_tokens": 169277545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2386 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7956666666666666, "grad_norm": 2.1397907890019496e-09, "kl": 0.046142578125, "learning_rate": 2.4452757149027308e-06, "loss": 0.0018, "num_tokens": 169349929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2387 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.796, "grad_norm": 1.7044946565292207e-09, "kl": 0.0472412109375, "learning_rate": 
2.4376574600347803e-06, "loss": 0.0019, "num_tokens": 169427225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2388 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7963333333333333, "grad_norm": 1.203796506921151e-09, "kl": 0.044921875, "learning_rate": 2.4300494434824373e-06, "loss": 0.0018, "num_tokens": 169502233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2389 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7966666666666666, "grad_norm": 1.950270611672522e-09, "kl": 0.04608154296875, "learning_rate": 2.422451675545855e-06, "loss": 0.0018, "num_tokens": 169578025.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2390 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.797, "grad_norm": 1.900799073695225e-09, "kl": 0.04547119140625, "learning_rate": 2.4148641665113116e-06, "loss": 0.0018, "num_tokens": 169658729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2391 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7973333333333333, "grad_norm": 1.4691889926865542e-09, "kl": 0.045166015625, "learning_rate": 2.407286926651192e-06, "loss": 0.0018, "num_tokens": 169733769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2392 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7976666666666666, "grad_norm": 1.5835007749487318e-09, "kl": 0.04644775390625, "learning_rate": 2.3997199662239825e-06, "loss": 0.0019, "num_tokens": 169809785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2393 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.798, "grad_norm": 2.099869611527083e-09, "kl": 0.04669189453125, "learning_rate": 2.39216329547425e-06, "loss": 0.0019, "num_tokens": 169887673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2394 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7983333333333333, "grad_norm": 1.0801550764938384e-09, "kl": 0.04376220703125, "learning_rate": 2.3846169246326345e-06, "loss": 0.0018, "num_tokens": 169963209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2395 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7986666666666666, "grad_norm": 1.4090657529663986e-09, "kl": 0.04681396484375, "learning_rate": 2.3770808639158216e-06, "loss": 0.0019, "num_tokens": 170037609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2396 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.799, "grad_norm": 1.9750074908841952e-09, "kl": 0.0457763671875, "learning_rate": 2.3695551235265492e-06, "loss": 0.0018, "num_tokens": 170113977.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2397 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7993333333333333, "grad_norm": 8.834233899968069e-10, "kl": 0.04327392578125, "learning_rate": 2.362039713653581e-06, "loss": 0.0017, "num_tokens": 170190009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2398 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7996666666666666, "grad_norm": 2.0394412825197605e-09, "kl": 0.0478515625, "learning_rate": 2.3545346444716842e-06, "loss": 0.0019, "num_tokens": 170267177.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2399 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8, "grad_norm": 3.6643641454503495e-09, "kl": 0.0443115234375, "learning_rate": 
2.347039926141644e-06, "loss": 0.0018, "num_tokens": 170342633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2400 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8003333333333333, "grad_norm": 2.8849962419030817e-09, "kl": 0.04779052734375, "learning_rate": 2.339555568810221e-06, "loss": 0.0019, "num_tokens": 170420201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2401 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8006666666666666, "grad_norm": 1.7071533076062906e-09, "kl": 0.0509033203125, "learning_rate": 2.332081582610146e-06, "loss": 0.002, "num_tokens": 170497865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2402 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.801, "grad_norm": 2.214656458221498e-09, "kl": 0.044921875, "learning_rate": 2.324617977660114e-06, "loss": 0.0018, "num_tokens": 170573561.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2403 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8013333333333333, "grad_norm": 1.0804485084392468e-09, "kl": 0.044677734375, "learning_rate": 2.317164764064769e-06, "loss": 0.0018, "num_tokens": 170649593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2404 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8016666666666666, "grad_norm": 1.6635335331471879e-09, "kl": 0.043212890625, "learning_rate": 2.309721951914675e-06, "loss": 0.0017, "num_tokens": 170724281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2405 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.802, "grad_norm": 1.2555982920048336e-09, "kl": 0.04559326171875, "learning_rate": 2.3022895512863207e-06, "loss": 0.0018, "num_tokens": 170799065.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2406 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8023333333333333, "grad_norm": 1.9337043077882754e-09, "kl": 0.043701171875, "learning_rate": 2.2948675722421086e-06, "loss": 0.0017, "num_tokens": 170880457.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2407 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8026666666666666, "grad_norm": 1.6985787221202031e-09, "kl": 0.0426025390625, "learning_rate": 2.2874560248303136e-06, "loss": 0.0017, "num_tokens": 170960617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2408 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.803, "grad_norm": 1.3114532793068179e-09, "kl": 0.04541015625, "learning_rate": 2.2800549190850997e-06, "loss": 0.0018, "num_tokens": 171035641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2409 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8033333333333333, "grad_norm": 2.086041117621562e-09, "kl": 0.04412841796875, "learning_rate": 2.27266426502649e-06, "loss": 0.0018, "num_tokens": 171110857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2410 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8036666666666666, "grad_norm": 1.9857901989439597e-09, "kl": 0.04803466796875, "learning_rate": 2.265284072660362e-06, "loss": 0.0019, "num_tokens": 171187833.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2411 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.804, "grad_norm": 1.628396750774641e-09, "kl": 0.04803466796875, "learning_rate": 
2.257914351978422e-06, "loss": 0.0019, "num_tokens": 171262921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2412 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8043333333333333, "grad_norm": 1.436234797758118e-09, "kl": 0.04364013671875, "learning_rate": 2.2505551129582047e-06, "loss": 0.0017, "num_tokens": 171336969.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2413 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8046666666666666, "grad_norm": 1.533853821733544e-09, "kl": 0.04498291015625, "learning_rate": 2.2432063655630555e-06, "loss": 0.0018, "num_tokens": 171413129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2414 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.805, "grad_norm": 1.2900661650050438e-09, "kl": 0.05133056640625, "learning_rate": 2.2358681197421094e-06, "loss": 0.0021, "num_tokens": 171489225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2415 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8053333333333333, "grad_norm": 1.3863969972049972e-09, "kl": 0.04571533203125, "learning_rate": 2.2285403854302912e-06, "loss": 0.0018, "num_tokens": 171562409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2416 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8056666666666666, "grad_norm": 2.0053567695299535e-09, "kl": 0.04522705078125, "learning_rate": 2.2212231725482914e-06, "loss": 0.0018, "num_tokens": 171638121.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2417 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.806, "grad_norm": 1.0757739143940626e-09, "kl": 0.04119873046875, "learning_rate": 2.213916491002551e-06, "loss": 0.0016, "num_tokens": 171712617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2418 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8063333333333333, "grad_norm": 1.1906510222203792e-09, "kl": 0.04400634765625, "learning_rate": 2.206620350685257e-06, "loss": 0.0018, "num_tokens": 171788137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2419 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8066666666666666, "grad_norm": 2.6464888058796987e-09, "kl": 0.044921875, "learning_rate": 2.1993347614743355e-06, "loss": 0.0018, "num_tokens": 171868009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2420 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.807, "grad_norm": 1.6010388570464329e-09, "kl": 0.04620361328125, "learning_rate": 2.192059733233408e-06, "loss": 0.0019, "num_tokens": 171945033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2421 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8073333333333333, "grad_norm": 1.1750415085387544e-09, "kl": 0.04608154296875, "learning_rate": 2.1847952758118118e-06, "loss": 0.0018, "num_tokens": 172019161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2422 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8076666666666666, "grad_norm": 1.718983289045184e-09, "kl": 0.0462646484375, "learning_rate": 2.177541399044573e-06, "loss": 0.0018, "num_tokens": 172096217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2423 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.808, "grad_norm": 3.139448923050736e-09, "kl": 0.03826904296875, "learning_rate": 
2.1702981127523827e-06, "loss": 0.0015, "num_tokens": 172178825.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2424 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8083333333333333, "grad_norm": 1.327751797397525e-09, "kl": 0.042236328125, "learning_rate": 2.163065426741603e-06, "loss": 0.0017, "num_tokens": 172252041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2425 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8086666666666666, "grad_norm": 2.454078273927962e-09, "kl": 0.04833984375, "learning_rate": 2.155843350804243e-06, "loss": 0.0019, "num_tokens": 172329273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2426 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.809, "grad_norm": 2.9387117184143108e-09, "kl": 0.041015625, "learning_rate": 2.1486318947179476e-06, "loss": 0.0016, "num_tokens": 172408441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2427 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8093333333333333, "grad_norm": 1.4199103004486346e-09, "kl": 0.04315185546875, "learning_rate": 2.1414310682459805e-06, "loss": 0.0017, "num_tokens": 172489289.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2428 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8096666666666666, "grad_norm": 2.194127768362364e-09, "kl": 0.04583740234375, "learning_rate": 2.1342408811372217e-06, "loss": 0.0018, "num_tokens": 172564889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2429 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.81, "grad_norm": 1.92905491580575e-09, "kl": 0.04998779296875, "learning_rate": 2.1270613431261367e-06, "loss": 0.002, "num_tokens": 172640985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2430 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8103333333333333, "grad_norm": 3.520464808559609e-09, "kl": 0.04705810546875, "learning_rate": 2.119892463932781e-06, "loss": 0.0019, "num_tokens": 172717913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2431 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8106666666666666, "grad_norm": 2.200611470826175e-09, "kl": 0.04595947265625, "learning_rate": 2.1127342532627794e-06, "loss": 0.0018, "num_tokens": 172795289.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2432 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.811, "grad_norm": 1.505857216699269e-09, "kl": 0.0458984375, "learning_rate": 2.10558672080731e-06, "loss": 0.0018, "num_tokens": 172869497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2433 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8113333333333334, "grad_norm": 1.1992862258836112e-09, "kl": 0.0460205078125, "learning_rate": 2.098449876243096e-06, "loss": 0.0018, "num_tokens": 172944249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2434 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8116666666666666, "grad_norm": 1.9396788619729932e-09, "kl": 0.04730224609375, "learning_rate": 2.091323729232391e-06, "loss": 0.0019, "num_tokens": 173019625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2435 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.812, "grad_norm": 2.285576394811528e-09, "kl": 0.04571533203125, "learning_rate": 2.084208289422968e-06, "loss": 0.0018, 
"num_tokens": 173097305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2436 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8123333333333334, "grad_norm": 1.8372916521514071e-09, "kl": 0.0447998046875, "learning_rate": 2.0771035664480944e-06, "loss": 0.0018, "num_tokens": 173173145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2437 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8126666666666666, "grad_norm": 2.9534956702548243e-09, "kl": 0.0400390625, "learning_rate": 2.070009569926539e-06, "loss": 0.0016, "num_tokens": 173251737.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2438 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.813, "grad_norm": 1.6942774960782003e-09, "kl": 0.0478515625, "learning_rate": 2.0629263094625476e-06, "loss": 0.0019, "num_tokens": 173327369.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2439 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8133333333333334, "grad_norm": 1.3462093662042207e-09, "kl": 0.046630859375, "learning_rate": 2.0558537946458177e-06, "loss": 0.0019, "num_tokens": 173402409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2440 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.8136666666666666, "grad_norm": 1.0766267877215796e-09, "kl": 0.043701171875, "learning_rate": 2.048792035051521e-06, "loss": 0.0017, "num_tokens": 173475225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2441 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.814, "grad_norm": 1.3438288259948195e-09, "kl": 0.04754638671875, "learning_rate": 2.041741040240255e-06, "loss": 0.0019, "num_tokens": 173550889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2442 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8143333333333334, "grad_norm": 9.137198775377442e-10, "kl": 0.0440673828125, "learning_rate": 2.0347008197580376e-06, "loss": 0.0018, "num_tokens": 173623337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2443 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8146666666666667, "grad_norm": 1.3256589159738041e-09, "kl": 0.04620361328125, "learning_rate": 2.0276713831363115e-06, "loss": 0.0018, "num_tokens": 173697769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2444 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.815, "grad_norm": 1.99012784030117e-09, "kl": 0.047119140625, "learning_rate": 2.020652739891914e-06, "loss": 0.0019, "num_tokens": 173777913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2445 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8153333333333334, "grad_norm": 1.1917519193715975e-09, "kl": 0.04669189453125, "learning_rate": 2.013644899527074e-06, "loss": 0.0019, "num_tokens": 173852729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2446 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8156666666666667, "grad_norm": 1.3997284442623936e-09, "kl": 0.04888916015625, "learning_rate": 2.0066478715293826e-06, "loss": 0.002, "num_tokens": 173928073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2447 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.816, "grad_norm": 1.4483105825746634e-09, "kl": 0.0443115234375, "learning_rate": 1.9996616653718126e-06, "loss": 
0.0018, "num_tokens": 174001305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2448 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8163333333333334, "grad_norm": 1.1990284320972933e-09, "kl": 0.04388427734375, "learning_rate": 1.9926862905126663e-06, "loss": 0.0018, "num_tokens": 174077577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2449 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8166666666666667, "grad_norm": 1.0822991391989945e-09, "kl": 0.04290771484375, "learning_rate": 1.9857217563955932e-06, "loss": 0.0017, "num_tokens": 174152921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2450 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.817, "grad_norm": 1.5778539586008833e-09, "kl": 0.0482177734375, "learning_rate": 1.9787680724495617e-06, "loss": 0.0019, "num_tokens": 174227993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2451 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8173333333333334, "grad_norm": 1.4429178962771516e-09, "kl": 0.04400634765625, "learning_rate": 1.9718252480888567e-06, "loss": 0.0018, "num_tokens": 174302137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2452 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.8176666666666667, "grad_norm": 2.0569510539303337e-09, "kl": 0.0455322265625, "learning_rate": 1.964893292713049e-06, "loss": 0.0018, "num_tokens": 174379369.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2453 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.818, "grad_norm": 1.238439573114647e-09, "kl": 0.05059814453125, "learning_rate": 1.9579722157070026e-06, "loss": 0.002, "num_tokens": 174455305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2454 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8183333333333334, "grad_norm": 1.5458189173145342e-09, "kl": 0.04705810546875, "learning_rate": 1.95106202644086e-06, "loss": 0.0019, "num_tokens": 174531097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2455 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8186666666666667, "grad_norm": 2.202642512827424e-09, "kl": 0.04974365234375, "learning_rate": 1.9441627342700067e-06, "loss": 0.002, "num_tokens": 174606793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2456 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.819, "grad_norm": 1.2999635812249721e-09, "kl": 0.04541015625, "learning_rate": 1.9372743485350887e-06, "loss": 0.0018, "num_tokens": 174682217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2457 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8193333333333334, "grad_norm": 9.890632757247886e-10, "kl": 0.04913330078125, "learning_rate": 1.930396878561983e-06, "loss": 0.002, "num_tokens": 174756841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2458 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8196666666666667, "grad_norm": 1.257830506418145e-09, "kl": 0.04547119140625, "learning_rate": 1.9235303336617827e-06, "loss": 0.0018, "num_tokens": 174830681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2459 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.82, "grad_norm": 2.058286208139748e-09, "kl": 0.04510498046875, "learning_rate": 1.916674723130796e-06, "loss": 0.0018, 
"num_tokens": 174906809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2460 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8203333333333334, "grad_norm": 1.7233940940997172e-09, "kl": 0.045166015625, "learning_rate": 1.9098300562505266e-06, "loss": 0.0018, "num_tokens": 174987689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2461 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8206666666666667, "grad_norm": 2.0414827606174413e-09, "kl": 0.04742431640625, "learning_rate": 1.9029963422876608e-06, "loss": 0.0019, "num_tokens": 175064297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2462 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.821, "grad_norm": 2.7579463157678674e-09, "kl": 0.0477294921875, "learning_rate": 1.896173590494057e-06, "loss": 0.0019, "num_tokens": 175141065.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2463 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8213333333333334, "grad_norm": 1.9415296037550434e-09, "kl": 0.04522705078125, "learning_rate": 1.8893618101067357e-06, "loss": 0.0018, "num_tokens": 175218089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2464 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.8216666666666667, "grad_norm": 1.0607219547154045e-09, "kl": 0.04754638671875, "learning_rate": 1.8825610103478531e-06, "loss": 0.0019, "num_tokens": 175291753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2465 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.822, "grad_norm": 1.4194794228927776e-09, "kl": 0.04461669921875, "learning_rate": 1.8757712004247098e-06, "loss": 0.0018, "num_tokens": 175365529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2466 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8223333333333334, "grad_norm": 1.5218216686818664e-09, "kl": 0.04693603515625, "learning_rate": 1.8689923895297247e-06, "loss": 0.0019, "num_tokens": 175441033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2467 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8226666666666667, "grad_norm": 1.8422288139419152e-09, "kl": 0.0443115234375, "learning_rate": 1.8622245868404244e-06, "loss": 0.0018, "num_tokens": 175516281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2468 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.823, "grad_norm": 1.6611644282349403e-09, "kl": 0.046142578125, "learning_rate": 1.8554678015194316e-06, "loss": 0.0018, "num_tokens": 175592457.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2469 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8233333333333334, "grad_norm": 1.45108991489451e-09, "kl": 0.04278564453125, "learning_rate": 1.848722042714457e-06, "loss": 0.0017, "num_tokens": 175670009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2470 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8236666666666667, "grad_norm": 1.0495018187839378e-09, "kl": 0.04400634765625, "learning_rate": 1.8419873195582815e-06, "loss": 0.0018, "num_tokens": 175746457.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2471 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.824, "grad_norm": 1.646014213818603e-09, "kl": 0.046630859375, "learning_rate": 
1.8352636411687374e-06, "loss": 0.0019, "num_tokens": 175824201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2472 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8243333333333334, "grad_norm": 1.6282787340671234e-09, "kl": 0.04693603515625, "learning_rate": 1.8285510166487154e-06, "loss": 0.0019, "num_tokens": 175897929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": null, "rewards/penalized_accuracy_reward/std": null, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2473 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8246666666666667, "grad_norm": 2.00043515086179e-09, "kl": 0.0482177734375, "learning_rate": 1.8218494550861375e-06, "loss": 0.0019, "num_tokens": 175972937.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2474 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.825, "grad_norm": 1.5168132305731774e-09, "kl": 0.046875, "learning_rate": 1.8151589655539391e-06, "loss": 0.0019, "num_tokens": 176047913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2475 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8253333333333334, "grad_norm": 2.0142749690421624e-09, "kl": 0.0452880859375, "learning_rate": 1.808479557110081e-06, "loss": 0.0018, "num_tokens": 176123241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2476 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8256666666666667, "grad_norm": 9.166812864336293e-10, "kl": 0.04437255859375, "learning_rate": 1.8018112387975139e-06, "loss": 0.0018, "num_tokens": 176197161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2477 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.826, "grad_norm": 1.8321856254388535e-09, "kl": 0.04608154296875, "learning_rate": 1.7951540196441698e-06, "loss": 0.0018, "num_tokens": 176273545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2478 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8263333333333334, "grad_norm": 1.1298240121249137e-09, "kl": 0.0419921875, "learning_rate": 1.7885079086629598e-06, "loss": 0.0017, "num_tokens": 176348169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2479 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8266666666666667, "grad_norm": 1.8433952142515864e-09, "kl": 0.0482177734375, "learning_rate": 1.7818729148517588e-06, "loss": 0.0019, "num_tokens": 176424809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2480 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.827, "grad_norm": 1.3199487058912496e-09, "kl": 0.0498046875, "learning_rate": 1.7752490471933769e-06, "loss": 0.002, "num_tokens": 176499145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2481 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8273333333333334, "grad_norm": 1.266582394521265e-09, "kl": 0.047119140625, "learning_rate": 1.7686363146555807e-06, "loss": 0.0019, "num_tokens": 176572665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2482 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8276666666666667, "grad_norm": 2.000862364681666e-09, "kl": 0.04522705078125, "learning_rate": 1.7620347261910498e-06, "loss": 0.0018, "num_tokens": 176648873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2483 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.828, "grad_norm": 1.9733867873128474e-09, "kl": 0.04534912109375, "learning_rate": 
1.7554442907373736e-06, "loss": 0.0018, "num_tokens": 176724505.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2484 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8283333333333334, "grad_norm": 1.661237813976868e-09, "kl": 0.04498291015625, "learning_rate": 1.7488650172170496e-06, "loss": 0.0018, "num_tokens": 176800857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2485 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8286666666666667, "grad_norm": 1.7345337388618987e-09, "kl": 0.0445556640625, "learning_rate": 1.742296914537459e-06, "loss": 0.0018, "num_tokens": 176875801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2486 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.829, "grad_norm": 1.6666518165564526e-09, "kl": 0.0455322265625, "learning_rate": 1.7357399915908646e-06, "loss": 0.0018, "num_tokens": 176950425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2487 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8293333333333334, "grad_norm": 1.1203760141853536e-09, "kl": 0.044677734375, "learning_rate": 1.7291942572543806e-06, "loss": 0.0018, "num_tokens": 177024665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2488 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8296666666666667, "grad_norm": 1.3555520039787439e-09, "kl": 0.04595947265625, "learning_rate": 1.7226597203899941e-06, "loss": 0.0018, "num_tokens": 177099049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2489 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.83, "grad_norm": 2.691412426258921e-09, "kl": 0.04595947265625, "learning_rate": 1.7161363898445138e-06, "loss": 0.0018, "num_tokens": 177175913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2490 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8303333333333334, "grad_norm": 1.6293999482996924e-09, "kl": 0.04681396484375, "learning_rate": 1.709624274449584e-06, "loss": 0.0019, "num_tokens": 177250521.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2491 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8306666666666667, "grad_norm": 2.158119905004696e-09, "kl": 0.04449462890625, "learning_rate": 1.7031233830216653e-06, "loss": 0.0018, "num_tokens": 177329273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2492 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.831, "grad_norm": 2.01321581627667e-09, "kl": 0.0457763671875, "learning_rate": 1.6966337243620267e-06, "loss": 0.0018, "num_tokens": 177406169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2493 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8313333333333334, "grad_norm": 1.530138238337031e-09, "kl": 0.0447998046875, "learning_rate": 1.6901553072567189e-06, "loss": 0.0018, "num_tokens": 177484841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2494 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8316666666666667, "grad_norm": 1.9579078358589186e-09, "kl": 0.04583740234375, "learning_rate": 1.6836881404765793e-06, "loss": 0.0018, "num_tokens": 177565001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2495 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.832, "grad_norm": 2.626694195484447e-09, "kl": 0.04620361328125, "learning_rate": 
1.677232232777224e-06, "loss": 0.0018, "num_tokens": 177640665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2496 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8323333333333334, "grad_norm": 1.175638364436793e-09, "kl": 0.05035400390625, "learning_rate": 1.6707875928990059e-06, "loss": 0.002, "num_tokens": 177715865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2497 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8326666666666667, "grad_norm": 1.830399609659139e-09, "kl": 0.0457763671875, "learning_rate": 1.6643542295670367e-06, "loss": 0.0018, "num_tokens": 177790857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2498 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.833, "grad_norm": 1.662479043318399e-09, "kl": 0.042724609375, "learning_rate": 1.6579321514911606e-06, "loss": 0.0017, "num_tokens": 177866121.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2499 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8333333333333334, "grad_norm": 1.0961127561159856e-09, "kl": 0.04705810546875, "learning_rate": 1.651521367365936e-06, "loss": 0.0019, "num_tokens": 177939785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2500 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8336666666666667, "grad_norm": 1.660401038883208e-09, "kl": 0.04241943359375, "learning_rate": 1.6451218858706374e-06, "loss": 0.0017, "num_tokens": 178016681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2501 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.834, "grad_norm": 1.316000863837985e-09, "kl": 0.04620361328125, "learning_rate": 1.638733715669234e-06, "loss": 0.0018, "num_tokens": 178091945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2502 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8343333333333334, "grad_norm": 2.3309589813891307e-09, "kl": 0.044921875, "learning_rate": 1.6323568654103838e-06, "loss": 0.0018, "num_tokens": 178169945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2503 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8346666666666667, "grad_norm": 1.5030010569461183e-09, "kl": 0.04351806640625, "learning_rate": 1.6259913437274167e-06, "loss": 0.0017, "num_tokens": 178245561.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2504 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.835, "grad_norm": 1.5137225917172259e-09, "kl": 0.04864501953125, "learning_rate": 1.6196371592383264e-06, "loss": 0.0019, "num_tokens": 178321113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2505 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8353333333333334, "grad_norm": 1.3590236713767467e-09, "kl": 0.04608154296875, "learning_rate": 1.6132943205457607e-06, "loss": 0.0018, "num_tokens": 178397753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2506 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8356666666666667, "grad_norm": 1.163644514079465e-09, "kl": 0.04473876953125, "learning_rate": 1.6069628362369993e-06, "loss": 0.0018, "num_tokens": 178471545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2507 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.836, "grad_norm": 1.188214304725932e-09, "kl": 0.04632568359375, "learning_rate": 
1.6006427148839554e-06, "loss": 0.0019, "num_tokens": 178545225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2508 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8363333333333334, "grad_norm": 2.5142559145763244e-09, "kl": 0.04656982421875, "learning_rate": 1.5943339650431578e-06, "loss": 0.0019, "num_tokens": 178622825.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2509 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8366666666666667, "grad_norm": 1.368064106443967e-09, "kl": 0.045166015625, "learning_rate": 1.5880365952557387e-06, "loss": 0.0018, "num_tokens": 178699401.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2510 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.837, "grad_norm": 2.8552809006043844e-09, "kl": 0.04779052734375, "learning_rate": 1.5817506140474248e-06, "loss": 0.0019, "num_tokens": 178779993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2511 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8373333333333334, "grad_norm": 1.5192601621194513e-09, "kl": 0.0435791015625, "learning_rate": 1.5754760299285255e-06, "loss": 0.0017, "num_tokens": 178854265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2512 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8376666666666667, "grad_norm": 1.8722465799925203e-09, "kl": 0.04888916015625, "learning_rate": 1.5692128513939142e-06, "loss": 0.002, "num_tokens": 178930073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2513 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.838, "grad_norm": 3.5012095445097202e-09, "kl": 0.04913330078125, "learning_rate": 1.5629610869230272e-06, "loss": 0.002, "num_tokens": 179008233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2514 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8383333333333334, "grad_norm": 2.2937911570153346e-09, "kl": 0.04669189453125, "learning_rate": 1.5567207449798517e-06, "loss": 0.0019, "num_tokens": 179085193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2515 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8386666666666667, "grad_norm": 1.9419670316267457e-09, "kl": 0.0465087890625, "learning_rate": 1.5504918340128982e-06, "loss": 0.0019, "num_tokens": 179162873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2516 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.839, "grad_norm": 1.2155141337899522e-09, "kl": 0.04901123046875, "learning_rate": 1.544274362455216e-06, "loss": 0.002, "num_tokens": 179238377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2517 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8393333333333334, "grad_norm": 2.5740238829286e-09, "kl": 0.0487060546875, "learning_rate": 1.538068338724361e-06, "loss": 0.0019, "num_tokens": 179314825.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2518 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8396666666666667, "grad_norm": 1.0880418788161705e-09, "kl": 0.0439453125, "learning_rate": 1.5318737712223853e-06, "loss": 0.0018, "num_tokens": 179389625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2519 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.84, "grad_norm": 1.2253025261088624e-09, "kl": 0.04290771484375, "learning_rate": 
1.5256906683358364e-06, "loss": 0.0017, "num_tokens": 179464521.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2520 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8403333333333334, "grad_norm": 2.1616044509897847e-09, "kl": 0.0482177734375, "learning_rate": 1.5195190384357405e-06, "loss": 0.0019, "num_tokens": 179541609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2521 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8406666666666667, "grad_norm": 2.509201069145206e-09, "kl": 0.04925537109375, "learning_rate": 1.513358889877592e-06, "loss": 0.002, "num_tokens": 179617593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2522 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.841, "grad_norm": 1.9853581001427756e-09, "kl": 0.04693603515625, "learning_rate": 1.5072102310013314e-06, "loss": 0.0019, "num_tokens": 179695113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2523 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8413333333333334, "grad_norm": 1.2919806335887074e-09, "kl": 0.04595947265625, "learning_rate": 1.5010730701313626e-06, "loss": 0.0018, "num_tokens": 179769481.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2524 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8416666666666667, "grad_norm": 1.0720540011277535e-09, "kl": 0.0447998046875, "learning_rate": 1.494947415576502e-06, "loss": 0.0018, "num_tokens": 179844121.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2525 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.842, "grad_norm": 2.1835697694427836e-09, "kl": 0.04864501953125, "learning_rate": 1.4888332756300027e-06, "loss": 0.0019, "num_tokens": 179921577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2526 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8423333333333334, "grad_norm": 2.063119675099756e-09, "kl": 0.0457763671875, "learning_rate": 1.4827306585695234e-06, "loss": 0.0018, "num_tokens": 179997225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2527 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8426666666666667, "grad_norm": 2.2825248358060435e-09, "kl": 0.04632568359375, "learning_rate": 1.4766395726571258e-06, "loss": 0.0019, "num_tokens": 180073641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2528 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.843, "grad_norm": 1.7258594553481998e-09, "kl": 0.04486083984375, "learning_rate": 1.4705600261392505e-06, "loss": 0.0018, "num_tokens": 180154761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2529 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8433333333333334, "grad_norm": 1.3981876767488188e-09, "kl": 0.04888916015625, "learning_rate": 1.4644920272467245e-06, "loss": 0.002, "num_tokens": 180228681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2530 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8436666666666667, "grad_norm": 2.0124735211624056e-09, "kl": 0.04620361328125, "learning_rate": 1.4584355841947452e-06, "loss": 0.0019, "num_tokens": 180303625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2531 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.844, "grad_norm": 1.558583484495557e-09, "kl": 0.04425048828125, "learning_rate": 
1.4523907051828502e-06, "loss": 0.0018, "num_tokens": 180378153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2532 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8443333333333334, "grad_norm": 1.2809792115930918e-09, "kl": 0.0445556640625, "learning_rate": 1.446357398394934e-06, "loss": 0.0018, "num_tokens": 180452937.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2533 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8446666666666667, "grad_norm": 1.6429358984382247e-09, "kl": 0.0430908203125, "learning_rate": 1.4403356719992201e-06, "loss": 0.0017, "num_tokens": 180527961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2534 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.845, "grad_norm": 1.6691761306475428e-09, "kl": 0.04339599609375, "learning_rate": 1.4343255341482486e-06, "loss": 0.0017, "num_tokens": 180603529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2535 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8453333333333334, "grad_norm": 1.4524653701997181e-09, "kl": 0.04449462890625, "learning_rate": 1.4283269929788779e-06, "loss": 0.0018, "num_tokens": 180677929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2536 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8456666666666667, "grad_norm": 1.9534622808237145e-09, "kl": 0.0458984375, "learning_rate": 1.4223400566122635e-06, "loss": 0.0018, "num_tokens": 180752777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2537 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.846, "grad_norm": 1.9489254654558863e-09, "kl": 0.048095703125, "learning_rate": 1.416364733153849e-06, "loss": 0.0019, "num_tokens": 180828233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2538 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8463333333333334, "grad_norm": 1.6075731856801667e-09, "kl": 0.0465087890625, "learning_rate": 1.4104010306933558e-06, "loss": 0.0019, "num_tokens": 180905609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2539 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8466666666666667, "grad_norm": 1.5459985513999186e-09, "kl": 0.0455322265625, "learning_rate": 1.4044489573047759e-06, "loss": 0.0018, "num_tokens": 180983625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2540 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.847, "grad_norm": 1.1015142131753919e-09, "kl": 0.04754638671875, "learning_rate": 1.3985085210463479e-06, "loss": 0.0019, "num_tokens": 181058265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2541 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8473333333333334, "grad_norm": 1.4479096810404712e-09, "kl": 0.04193115234375, "learning_rate": 1.3925797299605649e-06, "loss": 0.0017, "num_tokens": 181133993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2542 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8476666666666667, "grad_norm": 1.9511812165973197e-09, "kl": 0.04815673828125, "learning_rate": 1.3866625920741495e-06, "loss": 0.0019, "num_tokens": 181210969.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2543 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.848, "grad_norm": 1.7412599140342877e-09, "kl": 0.04486083984375, "learning_rate": 
1.3807571153980504e-06, "loss": 0.0018, "num_tokens": 181286105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2544 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8483333333333334, "grad_norm": 1.3836660706090242e-09, "kl": 0.04278564453125, "learning_rate": 1.3748633079274254e-06, "loss": 0.0017, "num_tokens": 181362201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2545 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8486666666666667, "grad_norm": 1.5559509236595659e-09, "kl": 0.0435791015625, "learning_rate": 1.368981177641636e-06, "loss": 0.0017, "num_tokens": 181439593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2546 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.849, "grad_norm": 1.6364407606772602e-09, "kl": 0.0433349609375, "learning_rate": 1.363110732504237e-06, "loss": 0.0017, "num_tokens": 181515721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2547 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8493333333333334, "grad_norm": 2.5346389431746275e-09, "kl": 0.04779052734375, "learning_rate": 1.3572519804629537e-06, "loss": 0.0019, "num_tokens": 181594393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2548 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8496666666666667, "grad_norm": 8.960080455366892e-10, "kl": 0.04595947265625, "learning_rate": 1.3514049294496911e-06, "loss": 0.0018, "num_tokens": 181671049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2549 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.85, "grad_norm": 1.3987283553618113e-09, "kl": 0.04547119140625, "learning_rate": 1.3455695873805086e-06, "loss": 0.0018, "num_tokens": 181745385.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2550 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8503333333333334, "grad_norm": 1.5169235867418251e-09, "kl": 0.043701171875, "learning_rate": 1.339745962155613e-06, "loss": 0.0017, "num_tokens": 181821433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2551 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8506666666666667, "grad_norm": 1.0315400755800397e-09, "kl": 0.045654296875, "learning_rate": 1.3339340616593487e-06, "loss": 0.0018, "num_tokens": 181895337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2552 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.851, "grad_norm": 1.4944254722593087e-09, "kl": 0.04644775390625, "learning_rate": 1.3281338937601895e-06, "loss": 0.0019, "num_tokens": 181969641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2553 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8513333333333334, "grad_norm": 2.0457020522002267e-09, "kl": 0.047607421875, "learning_rate": 1.322345466310717e-06, "loss": 0.0019, "num_tokens": 182045561.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2554 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8516666666666667, "grad_norm": 1.4869813158568945e-09, "kl": 0.0462646484375, "learning_rate": 1.316568787147624e-06, "loss": 0.0018, "num_tokens": 182122073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2555 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.852, "grad_norm": 1.9542290008445207e-09, "kl": 0.0469970703125, "learning_rate": 
1.3108038640916988e-06, "loss": 0.0019, "num_tokens": 182196057.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2556 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8523333333333334, "grad_norm": 8.708965770765076e-10, "kl": 0.040771484375, "learning_rate": 1.30505070494781e-06, "loss": 0.0016, "num_tokens": 182270121.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2557 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8526666666666667, "grad_norm": 1.6140492276051077e-09, "kl": 0.04754638671875, "learning_rate": 1.2993093175049022e-06, "loss": 0.0019, "num_tokens": 182346313.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2558 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.853, "grad_norm": 1.5871274294809723e-09, "kl": 0.047119140625, "learning_rate": 1.2935797095359825e-06, "loss": 0.0019, "num_tokens": 182421465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2559 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8533333333333334, "grad_norm": 1.9031267672886543e-09, "kl": 0.0439453125, "learning_rate": 1.2878618887981064e-06, "loss": 0.0018, "num_tokens": 182497305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2560 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8536666666666667, "grad_norm": 2.1138784056518034e-09, "kl": 0.04833984375, "learning_rate": 1.282155863032377e-06, "loss": 0.0019, "num_tokens": 182573881.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2561 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.854, "grad_norm": 1.5848523604589104e-09, "kl": 0.0474853515625, "learning_rate": 1.2764616399639252e-06, "loss": 0.0019, "num_tokens": 182650345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2562 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8543333333333333, "grad_norm": 1.354097167727275e-09, "kl": 0.04937744140625, "learning_rate": 1.2707792273019049e-06, "loss": 0.002, "num_tokens": 182725337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2563 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8546666666666667, "grad_norm": 1.7183370282225496e-09, "kl": 0.0455322265625, "learning_rate": 1.2651086327394745e-06, "loss": 0.0018, "num_tokens": 182801113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2564 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.855, "grad_norm": 2.0987287463469784e-09, "kl": 0.04620361328125, "learning_rate": 1.2594498639538032e-06, "loss": 0.0018, "num_tokens": 182876425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2565 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8553333333333333, "grad_norm": 2.755427441769598e-09, "kl": 0.0439453125, "learning_rate": 1.2538029286060428e-06, "loss": 0.0018, "num_tokens": 182952105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2566 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8556666666666667, "grad_norm": 1.520349401928911e-09, "kl": 0.04425048828125, "learning_rate": 1.2481678343413216e-06, "loss": 0.0018, "num_tokens": 183027625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2567 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.856, "grad_norm": 1.4083348931492878e-09, "kl": 0.04437255859375, "learning_rate": 
1.2425445887887422e-06, "loss": 0.0018, "num_tokens": 183103353.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2568 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8563333333333333, "grad_norm": 2.595440973252039e-09, "kl": 0.0457763671875, "learning_rate": 1.2369331995613664e-06, "loss": 0.0018, "num_tokens": 183182729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2569 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8566666666666667, "grad_norm": 1.3948296961885376e-09, "kl": 0.044921875, "learning_rate": 1.2313336742561965e-06, "loss": 0.0018, "num_tokens": 183257769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2570 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.857, "grad_norm": 1.1847309799861705e-09, "kl": 0.04510498046875, "learning_rate": 1.2257460204541793e-06, "loss": 0.0018, "num_tokens": 183333721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2571 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8573333333333333, "grad_norm": 2.0406178968812583e-09, "kl": 0.04443359375, "learning_rate": 1.2201702457201948e-06, "loss": 0.0018, "num_tokens": 183407577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2572 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8576666666666667, "grad_norm": 1.4084077237797032e-09, "kl": 0.04486083984375, "learning_rate": 1.2146063576030265e-06, "loss": 0.0018, "num_tokens": 183484905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2573 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.858, "grad_norm": 1.9261217065746905e-09, "kl": 0.04296875, "learning_rate": 1.2090543636353746e-06, "loss": 0.0017, "num_tokens": 183561609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2574 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8583333333333333, "grad_norm": 1.1394422072541488e-09, "kl": 0.046142578125, "learning_rate": 1.2035142713338366e-06, "loss": 0.0018, "num_tokens": 183637225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2575 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8586666666666667, "grad_norm": 2.962248224491759e-09, "kl": 0.0479736328125, "learning_rate": 1.1979860881988903e-06, "loss": 0.0019, "num_tokens": 183715209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2576 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.859, "grad_norm": 1.272532967888651e-09, "kl": 0.045166015625, "learning_rate": 1.1924698217148955e-06, "loss": 0.0018, "num_tokens": 183790585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2577 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8593333333333333, "grad_norm": 2.0194408367757433e-09, "kl": 0.04486083984375, "learning_rate": 1.1869654793500784e-06, "loss": 0.0018, "num_tokens": 183865241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2578 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8596666666666667, "grad_norm": 2.5757318500296833e-09, "kl": 0.0457763671875, "learning_rate": 1.18147306855652e-06, "loss": 0.0018, "num_tokens": 183943097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2579 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.86, "grad_norm": 2.58563370714171e-09, "kl": 0.044189453125, "learning_rate": 1.1759925967701491e-06, "loss": 0.0018, 
"num_tokens": 184021593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2580 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8603333333333333, "grad_norm": 1.808248439871818e-09, "kl": 0.04376220703125, "learning_rate": 1.1705240714107301e-06, "loss": 0.0018, "num_tokens": 184096265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2581 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8606666666666667, "grad_norm": 1.5734378244758318e-09, "kl": 0.04351806640625, "learning_rate": 1.1650674998818556e-06, "loss": 0.0017, "num_tokens": 184170233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2582 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.861, "grad_norm": 1.3491952000066476e-09, "kl": 0.04541015625, "learning_rate": 1.159622889570927e-06, "loss": 0.0018, "num_tokens": 184246425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2583 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8613333333333333, "grad_norm": 1.0357107393943465e-09, "kl": 0.04217529296875, "learning_rate": 1.1541902478491607e-06, "loss": 0.0017, "num_tokens": 184323337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2584 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.8616666666666667, "grad_norm": 1.6043160133705214e-09, "kl": 0.04803466796875, "learning_rate": 1.1487695820715672e-06, "loss": 0.0019, "num_tokens": 184398537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2585 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.862, "grad_norm": 1.5244179252249523e-09, "kl": 0.04388427734375, "learning_rate": 1.1433608995769396e-06, "loss": 0.0018, "num_tokens": 184473897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2586 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8623333333333333, "grad_norm": 9.53287337956965e-10, "kl": 0.04425048828125, "learning_rate": 1.1379642076878528e-06, "loss": 0.0018, "num_tokens": 184548905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2587 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8626666666666667, "grad_norm": 1.6500151245324446e-09, "kl": 0.04620361328125, "learning_rate": 1.1325795137106455e-06, "loss": 0.0018, "num_tokens": 184624041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2588 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.863, "grad_norm": 2.1974935204838175e-09, "kl": 0.04559326171875, "learning_rate": 1.1272068249354085e-06, "loss": 0.0018, "num_tokens": 184700393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2589 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8633333333333333, "grad_norm": 2.0920136734048356e-09, "kl": 0.04754638671875, "learning_rate": 1.1218461486359878e-06, "loss": 0.0019, "num_tokens": 184778697.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2590 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8636666666666667, "grad_norm": 1.40344147414595e-09, "kl": 0.04840087890625, "learning_rate": 1.1164974920699611e-06, "loss": 0.0019, "num_tokens": 184853849.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2591 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.864, "grad_norm": 1.3430054845997574e-09, "kl": 0.049560546875, "learning_rate": 
1.1111608624786307e-06, "loss": 0.002, "num_tokens": 184928217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2592 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8643333333333333, "grad_norm": 1.3960522737832548e-09, "kl": 0.04669189453125, "learning_rate": 1.1058362670870248e-06, "loss": 0.0019, "num_tokens": 185004889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2593 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8646666666666667, "grad_norm": 1.172054120424093e-09, "kl": 0.0474853515625, "learning_rate": 1.1005237131038725e-06, "loss": 0.0019, "num_tokens": 185079417.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2594 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.865, "grad_norm": 1.5231490513301083e-09, "kl": 0.04437255859375, "learning_rate": 1.0952232077215985e-06, "loss": 0.0018, "num_tokens": 185154041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2595 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8653333333333333, "grad_norm": 1.0784830806187529e-09, "kl": 0.0435791015625, "learning_rate": 1.0899347581163222e-06, "loss": 0.0017, "num_tokens": 185228393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2596 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8656666666666667, "grad_norm": 1.383213543704187e-09, "kl": 0.04345703125, "learning_rate": 1.0846583714478355e-06, "loss": 0.0017, "num_tokens": 185306633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2597 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.866, "grad_norm": 1.6527903490271e-09, "kl": 0.04644775390625, "learning_rate": 1.0793940548596048e-06, "loss": 0.0019, "num_tokens": 185382905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2598 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8663333333333333, "grad_norm": 1.1343456174373046e-09, "kl": 0.04541015625, "learning_rate": 1.0741418154787443e-06, "loss": 0.0018, "num_tokens": 185461705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2599 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8666666666666667, "grad_norm": 1.1936367450005037e-09, "kl": 0.04541015625, "learning_rate": 1.0689016604160341e-06, "loss": 0.0018, "num_tokens": 185536153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2600 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.867, "grad_norm": 1.4188243913082488e-09, "kl": 0.04827880859375, "learning_rate": 1.0636735967658785e-06, "loss": 0.0019, "num_tokens": 185611689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2601 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8673333333333333, "grad_norm": 1.69647040859644e-09, "kl": 0.04498291015625, "learning_rate": 1.058457631606319e-06, "loss": 0.0018, "num_tokens": 185688777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2602 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8676666666666667, "grad_norm": 1.7936276908159243e-09, "kl": 0.045654296875, "learning_rate": 1.0532537719990166e-06, "loss": 0.0018, "num_tokens": 185765897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2603 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.868, "grad_norm": 1.2953073058596942e-09, "kl": 0.04376220703125, "learning_rate": 
1.0480620249892448e-06, "loss": 0.0018, "num_tokens": 185840137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2604 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8683333333333333, "grad_norm": 1.4768137823750749e-09, "kl": 0.04364013671875, "learning_rate": 1.042882397605871e-06, "loss": 0.0017, "num_tokens": 185918281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2605 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8686666666666667, "grad_norm": 1.893451395673651e-09, "kl": 0.0491943359375, "learning_rate": 1.0377148968613659e-06, "loss": 0.002, "num_tokens": 185996169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2606 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.869, "grad_norm": 1.5184105084387056e-09, "kl": 0.04473876953125, "learning_rate": 1.0325595297517753e-06, "loss": 0.0018, "num_tokens": 186071513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2607 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8693333333333333, "grad_norm": 1.0488743207304196e-09, "kl": 0.04449462890625, "learning_rate": 1.0274163032567165e-06, "loss": 0.0018, "num_tokens": 186150121.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2608 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8696666666666667, "grad_norm": 2.3841635332644273e-09, "kl": 0.049560546875, "learning_rate": 1.0222852243393732e-06, "loss": 0.002, "num_tokens": 186225529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2609 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.87, "grad_norm": 3.7211060899267068e-09, "kl": 0.04638671875, "learning_rate": 1.017166299946486e-06, "loss": 0.0019, "num_tokens": 186304393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2610 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8703333333333333, "grad_norm": 1.3185385006053707e-09, "kl": 0.04705810546875, "learning_rate": 1.012059537008332e-06, "loss": 0.0019, "num_tokens": 186384697.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2611 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8706666666666667, "grad_norm": 1.0482662515798324e-09, "kl": 0.0438232421875, "learning_rate": 1.0069649424387274e-06, "loss": 0.0018, "num_tokens": 186460153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2612 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.871, "grad_norm": 1.4424126337786447e-09, "kl": 0.0447998046875, "learning_rate": 1.0018825231350203e-06, "loss": 0.0018, "num_tokens": 186535689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2613 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8713333333333333, "grad_norm": 1.5386187879329327e-09, "kl": 0.04437255859375, "learning_rate": 9.968122859780648e-07, "loss": 0.0018, "num_tokens": 186610761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2614 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8716666666666667, "grad_norm": 1.954133299619798e-09, "kl": 0.0450439453125, "learning_rate": 9.917542378322299e-07, "loss": 0.0018, "num_tokens": 186688297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2615 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.872, "grad_norm": 2.2020136825062764e-09, "kl": 0.04656982421875, "learning_rate": 
9.867083855453775e-07, "loss": 0.0019, "num_tokens": 186765817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2616 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8723333333333333, "grad_norm": 1.8144351576765416e-09, "kl": 0.0462646484375, "learning_rate": 9.816747359488632e-07, "loss": 0.0019, "num_tokens": 186841897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2617 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8726666666666667, "grad_norm": 2.402335663731492e-09, "kl": 0.04559326171875, "learning_rate": 9.766532958575158e-07, "loss": 0.0018, "num_tokens": 186916025.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2618 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.873, "grad_norm": 1.4105491219496002e-09, "kl": 0.04754638671875, "learning_rate": 9.716440720696375e-07, "loss": 0.0019, "num_tokens": 186991257.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2619 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8733333333333333, "grad_norm": 1.0827277963088022e-09, "kl": 0.04827880859375, "learning_rate": 9.666470713669918e-07, "loss": 0.0019, "num_tokens": 187065145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2620 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8736666666666667, "grad_norm": 1.6385162115994945e-09, "kl": 0.0472412109375, "learning_rate": 9.616623005147952e-07, "loss": 0.0019, "num_tokens": 187141993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2621 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.874, "grad_norm": 2.8021986953064015e-09, "kl": 0.0467529296875, "learning_rate": 9.566897662617014e-07, "loss": 0.0019, "num_tokens": 187218041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2622 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8743333333333333, "grad_norm": 1.4414750504343488e-09, "kl": 0.04541015625, "learning_rate": 9.517294753398066e-07, "loss": 0.0018, "num_tokens": 187293609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2623 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8746666666666667, "grad_norm": 1.9151751295964914e-09, "kl": 0.0458984375, "learning_rate": 9.467814344646187e-07, "loss": 0.0018, "num_tokens": 187370585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2624 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.875, "grad_norm": 1.572804664284888e-09, "kl": 0.047119140625, "learning_rate": 9.418456503350714e-07, "loss": 0.0019, "num_tokens": 187444041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2625 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8753333333333333, "grad_norm": 1.2705704266480211e-09, "kl": 0.0433349609375, "learning_rate": 9.369221296335007e-07, "loss": 0.0017, "num_tokens": 187519433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2626 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8756666666666667, "grad_norm": 2.4518764796255255e-09, "kl": 0.04833984375, "learning_rate": 9.320108790256399e-07, "loss": 0.0019, "num_tokens": 187598649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2627 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.876, "grad_norm": 8.608486146144401e-10, "kl": 0.0474853515625, "learning_rate": 9.271119051606103e-07, "loss": 0.0019, 
"num_tokens": 187671929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2628 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8763333333333333, "grad_norm": 1.4270256087911548e-09, "kl": 0.0474853515625, "learning_rate": 9.222252146709143e-07, "loss": 0.0019, "num_tokens": 187747209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2629 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8766666666666667, "grad_norm": 1.2907819257890196e-09, "kl": 0.0416259765625, "learning_rate": 9.173508141724197e-07, "loss": 0.0017, "num_tokens": 187823049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2630 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.877, "grad_norm": 1.7641250682487453e-09, "kl": 0.04388427734375, "learning_rate": 9.124887102643576e-07, "loss": 0.0018, "num_tokens": 187899369.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2631 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8773333333333333, "grad_norm": 2.342011917733089e-09, "kl": 0.0478515625, "learning_rate": 9.076389095293148e-07, "loss": 0.0019, "num_tokens": 187981561.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2632 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.8776666666666667, "grad_norm": 9.404841350146853e-10, "kl": 0.042236328125, "learning_rate": 9.028014185332168e-07, "loss": 0.0017, "num_tokens": 188055641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2633 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.878, "grad_norm": 3.865779696354821e-09, "kl": 0.0428466796875, "learning_rate": 8.979762438253259e-07, "loss": 0.0017, "num_tokens": 188131641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2634 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8783333333333333, "grad_norm": 2.162332757293939e-09, "kl": 0.046630859375, "learning_rate": 8.931633919382299e-07, "loss": 0.0019, "num_tokens": 188210169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2635 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8786666666666667, "grad_norm": 2.2158062051857996e-09, "kl": 0.03955078125, "learning_rate": 8.883628693878299e-07, "loss": 0.0016, "num_tokens": 188286105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2636 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.879, "grad_norm": 1.310844544022416e-09, "kl": 0.0462646484375, "learning_rate": 8.835746826733404e-07, "loss": 0.0019, "num_tokens": 188360393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2637 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8793333333333333, "grad_norm": 1.2211397448780303e-09, "kl": 0.0443115234375, "learning_rate": 8.787988382772705e-07, "loss": 0.0018, "num_tokens": 188436729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2638 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8796666666666667, "grad_norm": 2.4066997283966884e-09, "kl": 0.05029296875, "learning_rate": 8.740353426654236e-07, "loss": 0.002, "num_tokens": 188515049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2639 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.88, "grad_norm": 1.4026209083084495e-09, "kl": 0.04486083984375, "learning_rate": 8.692842022868764e-07, "loss": 0.0018, 
"num_tokens": 188590969.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2640 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8803333333333333, "grad_norm": 1.4049891250422775e-09, "kl": 0.04443359375, "learning_rate": 8.645454235739903e-07, "loss": 0.0018, "num_tokens": 188665465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2641 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8806666666666667, "grad_norm": 1.828218576527263e-09, "kl": 0.046142578125, "learning_rate": 8.598190129423844e-07, "loss": 0.0018, "num_tokens": 188742265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2642 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.881, "grad_norm": 1.9109716031806556e-09, "kl": 0.047119140625, "learning_rate": 8.551049767909314e-07, "loss": 0.0019, "num_tokens": 188817577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2643 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8813333333333333, "grad_norm": 2.4950443933136057e-09, "kl": 0.04620361328125, "learning_rate": 8.504033215017527e-07, "loss": 0.0018, "num_tokens": 188892169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2644 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.8816666666666667, "grad_norm": 1.2899302737068297e-09, "kl": 0.04339599609375, "learning_rate": 8.457140534402098e-07, "loss": 0.0017, "num_tokens": 188969529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2645 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.882, "grad_norm": 1.6007293268671674e-09, "kl": 0.04376220703125, "learning_rate": 8.41037178954891e-07, "loss": 0.0017, "num_tokens": 189046265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2646 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8823333333333333, "grad_norm": 1.557202033986016e-09, "kl": 0.04937744140625, "learning_rate": 8.363727043776037e-07, "loss": 0.002, "num_tokens": 189123001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2647 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8826666666666667, "grad_norm": 1.2985638120355247e-09, "kl": 0.04388427734375, "learning_rate": 8.317206360233765e-07, "loss": 0.0018, "num_tokens": 189197801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2648 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.883, "grad_norm": 1.1025262924846402e-09, "kl": 0.04510498046875, "learning_rate": 8.270809801904301e-07, "loss": 0.0018, "num_tokens": 189271417.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2649 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8833333333333333, "grad_norm": 9.770765307948182e-10, "kl": 0.04339599609375, "learning_rate": 8.224537431601886e-07, "loss": 0.0017, "num_tokens": 189346841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2650 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8836666666666667, "grad_norm": 1.3961618527957853e-09, "kl": 0.04205322265625, "learning_rate": 8.178389311972612e-07, "loss": 0.0017, "num_tokens": 189422569.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2651 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.884, "grad_norm": 2.638132601262555e-09, "kl": 0.0469970703125, "learning_rate": 
8.13236550549431e-07, "loss": 0.0019, "num_tokens": 189500713.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2652 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8843333333333333, "grad_norm": 1.4742652654220478e-09, "kl": 0.0465087890625, "learning_rate": 8.086466074476562e-07, "loss": 0.0019, "num_tokens": 189575305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2653 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8846666666666667, "grad_norm": 9.411876833453903e-10, "kl": 0.04901123046875, "learning_rate": 8.040691081060548e-07, "loss": 0.002, "num_tokens": 189649113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2654 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.885, "grad_norm": 1.8089580944291583e-09, "kl": 0.04638671875, "learning_rate": 7.99504058721896e-07, "loss": 0.0019, "num_tokens": 189726409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2655 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8853333333333333, "grad_norm": 1.2890151168676311e-09, "kl": 0.0472412109375, "learning_rate": 7.949514654755963e-07, "loss": 0.0019, "num_tokens": 189801529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2656 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8856666666666667, "grad_norm": 1.8693517844781127e-09, "kl": 0.0450439453125, "learning_rate": 7.904113345307073e-07, "loss": 0.0018, "num_tokens": 189879705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2657 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.886, "grad_norm": 1.2553108552637582e-09, "kl": 0.041748046875, "learning_rate": 7.85883672033908e-07, "loss": 0.0017, "num_tokens": 189953529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2658 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8863333333333333, "grad_norm": 1.6984924577911897e-09, "kl": 0.04644775390625, "learning_rate": 7.81368484114996e-07, "loss": 0.0019, "num_tokens": 190029129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2659 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8866666666666667, "grad_norm": 1.9010564233923333e-09, "kl": 0.04718017578125, "learning_rate": 7.768657768868803e-07, "loss": 0.0019, "num_tokens": 190105545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2660 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.887, "grad_norm": 1.2692297213234838e-09, "kl": 0.04833984375, "learning_rate": 7.723755564455771e-07, "loss": 0.0019, "num_tokens": 190179385.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2661 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 
1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8873333333333333, "grad_norm": 2.097444662396697e-09, "kl": 0.04620361328125, "learning_rate": 7.678978288701911e-07, "loss": 0.0018, "num_tokens": 190256297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2662 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8876666666666667, "grad_norm": 1.4038258333570752e-09, "kl": 0.04620361328125, "learning_rate": 7.634326002229175e-07, "loss": 0.0018, "num_tokens": 190330505.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2663 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.888, "grad_norm": 1.939425287034169e-09, "kl": 0.046142578125, "learning_rate": 7.589798765490308e-07, "loss": 0.0018, 
"num_tokens": 190407161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2664 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8883333333333333, "grad_norm": 1.2057269627163691e-09, "kl": 0.04296875, "learning_rate": 7.545396638768698e-07, "loss": 0.0017, "num_tokens": 190481785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2665 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8886666666666667, "grad_norm": 1.552492467915556e-09, "kl": 0.04150390625, "learning_rate": 7.501119682178392e-07, "loss": 0.0017, "num_tokens": 190556217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 
0.0, "rewards/tag_count_reward/std": 0.0, "step": 2666 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.889, "grad_norm": 1.681061512215365e-09, "kl": 0.047119140625, "learning_rate": 7.456967955663996e-07, "loss": 0.0019, "num_tokens": 190631897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2667 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8893333333333333, "grad_norm": 9.356240227020862e-10, "kl": 0.04205322265625, "learning_rate": 7.412941519000527e-07, "loss": 0.0017, "num_tokens": 190707849.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2668 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 
0.8896666666666667, "grad_norm": 1.1852688830416014e-09, "kl": 0.04541015625, "learning_rate": 7.369040431793406e-07, "loss": 0.0018, "num_tokens": 190781753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2669 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.89, "grad_norm": 1.3693829403749191e-09, "kl": 0.0457763671875, "learning_rate": 7.325264753478356e-07, "loss": 0.0018, "num_tokens": 190856585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2670 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8903333333333333, "grad_norm": 1.2500912527357855e-09, "kl": 0.04437255859375, "learning_rate": 7.281614543321269e-07, "loss": 0.0018, "num_tokens": 190934665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2671 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8906666666666667, "grad_norm": 1.3919477792612156e-09, "kl": 0.0418701171875, "learning_rate": 7.238089860418218e-07, "loss": 0.0017, "num_tokens": 191011209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2672 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.891, "grad_norm": 1.2578008634633875e-09, "kl": 0.0447998046875, "learning_rate": 7.194690763695312e-07, "loss": 0.0018, "num_tokens": 191084441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2673 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 
0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8913333333333333, "grad_norm": 1.169407903844899e-09, "kl": 0.04522705078125, "learning_rate": 7.151417311908648e-07, "loss": 0.0018, "num_tokens": 191160489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8916666666666667, "grad_norm": 1.7708263744253827e-09, "kl": 0.04779052734375, "learning_rate": 7.108269563644188e-07, "loss": 0.0019, "num_tokens": 191236841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2675 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.892, "grad_norm": 1.6522823109710316e-09, "kl": 0.04437255859375, "learning_rate": 7.065247577317747e-07, "loss": 0.0018, "num_tokens": 191311033.0, "reward": 0.0, 
"reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2676 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8923333333333333, "grad_norm": 3.0241904536154607e-09, "kl": 0.0447998046875, "learning_rate": 7.022351411174866e-07, "loss": 0.0018, "num_tokens": 191390281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2677 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8926666666666667, "grad_norm": 2.002348953311639e-09, "kl": 0.04443359375, "learning_rate": 6.979581123290702e-07, "loss": 0.0018, "num_tokens": 191466761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 
0.0, "step": 2678 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.893, "grad_norm": 1.2075672684019878e-09, "kl": 0.0467529296875, "learning_rate": 6.936936771570046e-07, "loss": 0.0019, "num_tokens": 191540665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2679 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8933333333333333, "grad_norm": 2.1923927118194797e-09, "kl": 0.04754638671875, "learning_rate": 6.894418413747183e-07, "loss": 0.0019, "num_tokens": 191615833.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2680 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8936666666666667, "grad_norm": 
1.2265195525884565e-09, "kl": 0.0474853515625, "learning_rate": 6.852026107385756e-07, "loss": 0.0019, "num_tokens": 191690089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2681 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.894, "grad_norm": 3.240332668852375e-09, "kl": 0.048583984375, "learning_rate": 6.809759909878855e-07, "loss": 0.0019, "num_tokens": 191769321.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2682 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8943333333333333, "grad_norm": 2.8002382634895184e-09, "kl": 0.04852294921875, "learning_rate": 6.767619878448783e-07, "loss": 0.0019, "num_tokens": 191845657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2683 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8946666666666667, "grad_norm": 1.597170173894824e-09, "kl": 0.040771484375, "learning_rate": 6.725606070147006e-07, "loss": 0.0016, "num_tokens": 191919689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2684 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.895, "grad_norm": 1.2723861964047956e-09, "kl": 0.0491943359375, "learning_rate": 6.683718541854134e-07, "loss": 0.002, "num_tokens": 191993817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2685 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8953333333333333, "grad_norm": 1.399685811698248e-09, "kl": 0.044677734375, "learning_rate": 6.641957350279838e-07, "loss": 0.0018, "num_tokens": 192073257.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2686 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8956666666666667, "grad_norm": 2.2074335692678915e-09, "kl": 0.0433349609375, "learning_rate": 6.60032255196268e-07, "loss": 0.0017, "num_tokens": 192152633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2687 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.896, "grad_norm": 1.1476454231384992e-09, "kl": 0.0440673828125, "learning_rate": 6.558814203270147e-07, "loss": 0.0018, "num_tokens": 192228345.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2688 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8963333333333333, "grad_norm": 1.1839437208394088e-09, "kl": 0.04608154296875, "learning_rate": 6.517432360398556e-07, "loss": 0.0018, "num_tokens": 192302889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2689 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8966666666666666, "grad_norm": 1.531577975555365e-09, "kl": 0.0469970703125, "learning_rate": 6.476177079372903e-07, "loss": 0.0019, "num_tokens": 192376761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2690 
}, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.897, "grad_norm": 1.2339079757950344e-09, "kl": 0.0438232421875, "learning_rate": 6.435048416046863e-07, "loss": 0.0018, "num_tokens": 192452553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2691 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8973333333333333, "grad_norm": 1.0130593031121293e-09, "kl": 0.04632568359375, "learning_rate": 6.394046426102673e-07, "loss": 0.0019, "num_tokens": 192526617.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2692 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8976666666666666, "grad_norm": 
1.8076048435844427e-09, "kl": 0.0445556640625, "learning_rate": 6.353171165051109e-07, "loss": 0.0018, "num_tokens": 192601721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2693 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.898, "grad_norm": 1.4858033692277672e-09, "kl": 0.04437255859375, "learning_rate": 6.312422688231323e-07, "loss": 0.0018, "num_tokens": 192677961.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2694 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8983333333333333, "grad_norm": 1.174542352266883e-09, "kl": 0.04583740234375, "learning_rate": 6.271801050810856e-07, "loss": 0.0018, "num_tokens": 192752249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2695 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8986666666666666, "grad_norm": 1.7336796442890545e-09, "kl": 0.04986572265625, "learning_rate": 6.231306307785523e-07, "loss": 0.002, "num_tokens": 192828105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2696 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.899, "grad_norm": 1.159254914284702e-09, "kl": 0.0467529296875, "learning_rate": 6.190938513979317e-07, "loss": 0.0019, "num_tokens": 192902025.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2697 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8993333333333333, "grad_norm": 9.70292957092056e-10, "kl": 0.04730224609375, "learning_rate": 6.150697724044407e-07, "loss": 0.0019, "num_tokens": 192976777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2698 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8996666666666666, "grad_norm": 2.9080164942740794e-09, "kl": 0.0467529296875, "learning_rate": 6.110583992460984e-07, "loss": 0.0019, "num_tokens": 193054809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2699 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9, "grad_norm": 1.814270844668897e-09, "kl": 0.04632568359375, "learning_rate": 6.070597373537201e-07, "loss": 0.0019, "num_tokens": 193136345.0, "reward": 0.0, "reward_std": 0.0, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2700 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9003333333333333, "grad_norm": 1.4713595897219989e-09, "kl": 0.046142578125, "learning_rate": 6.030737921409169e-07, "loss": 0.0018, "num_tokens": 193214361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2701 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9006666666666666, "grad_norm": 1.4805883186141955e-09, "kl": 0.04803466796875, "learning_rate": 5.991005690040797e-07, "loss": 0.0019, "num_tokens": 193290985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2702 
}, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.901, "grad_norm": 1.622341261331428e-09, "kl": 0.04644775390625, "learning_rate": 5.951400733223766e-07, "loss": 0.0019, "num_tokens": 193366473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2703 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9013333333333333, "grad_norm": 1.7025219012367643e-09, "kl": 0.046142578125, "learning_rate": 5.911923104577455e-07, "loss": 0.0018, "num_tokens": 193442265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2704 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9016666666666666, "grad_norm": 
1.489463663517654e-09, "kl": 0.0452880859375, "learning_rate": 5.872572857548853e-07, "loss": 0.0018, "num_tokens": 193517129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2705 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.902, "grad_norm": 1.3886027883103225e-09, "kl": 0.04571533203125, "learning_rate": 5.833350045412478e-07, "loss": 0.0018, "num_tokens": 193592905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2706 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9023333333333333, "grad_norm": 1.3443077762076427e-09, "kl": 0.04425048828125, "learning_rate": 5.794254721270331e-07, "loss": 0.0018, "num_tokens": 193670201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2707 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9026666666666666, "grad_norm": 1.6732656371587495e-09, "kl": 0.0465087890625, "learning_rate": 5.75528693805183e-07, "loss": 0.0019, "num_tokens": 193746153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2708 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.903, "grad_norm": 2.249676001042644e-09, "kl": 0.04803466796875, "learning_rate": 5.716446748513682e-07, "loss": 0.0019, "num_tokens": 193824537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2709 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, 
"completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9033333333333333, "grad_norm": 1.4147114590912224e-09, "kl": 0.0478515625, "learning_rate": 5.677734205239904e-07, "loss": 0.0019, "num_tokens": 193899513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2710 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9036666666666666, "grad_norm": 2.226965722940122e-09, "kl": 0.047607421875, "learning_rate": 5.63914936064165e-07, "loss": 0.0019, "num_tokens": 193977161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2711 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.904, "grad_norm": 1.8142483071414972e-09, "kl": 0.0467529296875, "learning_rate": 5.600692266957208e-07, "loss": 0.0019, "num_tokens": 194053017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 
0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2712 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9043333333333333, "grad_norm": 1.6519959844529808e-09, "kl": 0.046142578125, "learning_rate": 5.562362976251901e-07, "loss": 0.0018, "num_tokens": 194129113.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2713 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9046666666666666, "grad_norm": 1.6164379834648912e-09, "kl": 0.04547119140625, "learning_rate": 5.524161540418039e-07, "loss": 0.0018, "num_tokens": 194205225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2714 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.905, "grad_norm": 1.5143343246037944e-09, "kl": 0.04541015625, "learning_rate": 5.48608801117485e-07, "loss": 0.0018, "num_tokens": 194280681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2715 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9053333333333333, "grad_norm": 1.825068540739494e-09, "kl": 0.0447998046875, "learning_rate": 5.448142440068316e-07, "loss": 0.0018, "num_tokens": 194354857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2716 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9056666666666666, "grad_norm": 9.96274396314334e-10, "kl": 0.04925537109375, 
"learning_rate": 5.410324878471296e-07, "loss": 0.002, "num_tokens": 194428601.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2717 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.906, "grad_norm": 8.817634955526898e-10, "kl": 0.04949951171875, "learning_rate": 5.37263537758328e-07, "loss": 0.002, "num_tokens": 194502649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2718 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9063333333333333, "grad_norm": 1.4872045817071466e-09, "kl": 0.04766845703125, "learning_rate": 5.335073988430373e-07, "loss": 0.0019, "num_tokens": 194579481.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, 
"rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2719 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9066666666666666, "grad_norm": 1.5394499008891671e-09, "kl": 0.045166015625, "learning_rate": 5.297640761865242e-07, "loss": 0.0018, "num_tokens": 194655337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2720 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.907, "grad_norm": 1.8699926052079263e-09, "kl": 0.0457763671875, "learning_rate": 5.26033574856708e-07, "loss": 0.0018, "num_tokens": 194730265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2721 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, 
"completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9073333333333333, "grad_norm": 1.7702224130999866e-09, "kl": 0.0501708984375, "learning_rate": 5.223158999041444e-07, "loss": 0.002, "num_tokens": 194806393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2722 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9076666666666666, "grad_norm": 2.0958821345118395e-09, "kl": 0.046630859375, "learning_rate": 5.18611056362025e-07, "loss": 0.0019, "num_tokens": 194882057.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2723 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.908, "grad_norm": 2.8289557363336826e-09, "kl": 0.04193115234375, "learning_rate": 5.149190492461753e-07, "loss": 0.0017, "num_tokens": 194959673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2724 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9083333333333333, "grad_norm": 1.5199149716593752e-09, "kl": 0.04644775390625, "learning_rate": 5.112398835550348e-07, "loss": 0.0019, "num_tokens": 195032553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2725 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9086666666666666, "grad_norm": 1.6076604492099023e-09, "kl": 0.0457763671875, "learning_rate": 5.075735642696611e-07, "loss": 0.0018, "num_tokens": 195110185.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2726 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.909, "grad_norm": 2.8266640139662513e-09, "kl": 0.0478515625, "learning_rate": 5.039200963537194e-07, "loss": 0.0019, "num_tokens": 195187513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2727 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9093333333333333, "grad_norm": 1.8253267786150218e-09, "kl": 0.04547119140625, "learning_rate": 5.002794847534765e-07, "loss": 0.0018, "num_tokens": 195267353.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2728 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9096666666666666, "grad_norm": 1.1923446674444449e-09, "kl": 0.04266357421875, "learning_rate": 
4.966517343977884e-07, "loss": 0.0017, "num_tokens": 195341513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2729 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.91, "grad_norm": 1.3433225642955904e-09, "kl": 0.04498291015625, "learning_rate": 4.930368501981097e-07, "loss": 0.0018, "num_tokens": 195417529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2730 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9103333333333333, "grad_norm": 1.479961930783702e-09, "kl": 0.04742431640625, "learning_rate": 4.894348370484648e-07, "loss": 0.0019, "num_tokens": 195493001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2731 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9106666666666666, "grad_norm": 1.5919801032993064e-09, "kl": 0.04547119140625, "learning_rate": 4.858456998254591e-07, "loss": 0.0018, "num_tokens": 195569241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2732 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.911, "grad_norm": 1.924354453564092e-09, "kl": 0.04046630859375, "learning_rate": 4.822694433882635e-07, "loss": 0.0016, "num_tokens": 195646089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2733 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9113333333333333, "grad_norm": 1.4513840129737332e-09, "kl": 0.04498291015625, "learning_rate": 4.787060725786141e-07, "loss": 0.0018, "num_tokens": 195720681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2734 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9116666666666666, "grad_norm": 1.7920511741209566e-09, "kl": 0.0455322265625, "learning_rate": 4.75155592220794e-07, "loss": 0.0018, "num_tokens": 195795529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2735 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.912, "grad_norm": 1.3332354109607536e-09, "kl": 0.0465087890625, "learning_rate": 4.7161800712163807e-07, "loss": 0.0019, "num_tokens": 195872105.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2736 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9123333333333333, "grad_norm": 2.0897301666877866e-09, "kl": 0.0477294921875, "learning_rate": 4.6809332207053083e-07, "loss": 0.0019, "num_tokens": 195948873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2737 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9126666666666666, "grad_norm": 1.4148687776938118e-09, "kl": 0.046630859375, "learning_rate": 4.6458154183937733e-07, "loss": 0.0019, "num_tokens": 196023193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2738 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.913, "grad_norm": 1.2989350706149594e-09, "kl": 0.046875, "learning_rate": 4.6108267118262327e-07, "loss": 0.0019, "num_tokens": 196097017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2739 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9133333333333333, "grad_norm": 1.7007215635800321e-09, "kl": 0.0482177734375, "learning_rate": 4.575967148372318e-07, "loss": 0.0019, "num_tokens": 196172569.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2740 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9136666666666666, "grad_norm": 2.5038797701881776e-09, "kl": 0.04400634765625, "learning_rate": 
4.5412367752268094e-07, "loss": 0.0018, "num_tokens": 196250921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2741 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.914, "grad_norm": 1.3459204861732132e-09, "kl": 0.04351806640625, "learning_rate": 4.506635639409607e-07, "loss": 0.0017, "num_tokens": 196329817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2742 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9143333333333333, "grad_norm": 1.898086576801461e-09, "kl": 0.04833984375, "learning_rate": 4.4721637877656377e-07, "loss": 0.0019, "num_tokens": 196407385.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2743 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9146666666666666, "grad_norm": 8.262276973702853e-10, "kl": 0.043212890625, "learning_rate": 4.4378212669647814e-07, "loss": 0.0017, "num_tokens": 196481785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2744 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.915, "grad_norm": 1.3059461290154672e-09, "kl": 0.04620361328125, "learning_rate": 4.4036081235018347e-07, "loss": 0.0018, "num_tokens": 196556665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2745 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9153333333333333, "grad_norm": 2.2721404757675145e-09, "kl": 0.04449462890625, "learning_rate": 4.3695244036964567e-07, "loss": 0.0018, "num_tokens": 196634153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2746 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9156666666666666, "grad_norm": 2.4467967651986555e-09, "kl": 0.04669189453125, "learning_rate": 4.335570153693036e-07, "loss": 0.0019, "num_tokens": 196713465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2747 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.916, "grad_norm": 1.9677999230083287e-09, "kl": 0.04693603515625, "learning_rate": 4.301745419460712e-07, "loss": 0.0019, "num_tokens": 196789513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2748 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9163333333333333, "grad_norm": 1.5940316844265112e-09, "kl": 0.041748046875, "learning_rate": 4.268050246793276e-07, "loss": 0.0017, "num_tokens": 196863881.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2749 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9166666666666666, "grad_norm": 2.232148466063677e-09, "kl": 0.04779052734375, "learning_rate": 4.234484681309103e-07, "loss": 0.0019, "num_tokens": 196941273.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2750 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.917, "grad_norm": 1.4069904130664668e-09, "kl": 0.04315185546875, "learning_rate": 4.2010487684511105e-07, "loss": 0.0017, "num_tokens": 197015753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2751 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9173333333333333, "grad_norm": 1.3602932114054056e-09, "kl": 0.04486083984375, "learning_rate": 4.167742553486676e-07, "loss": 0.0018, "num_tokens": 197091289.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2752 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9176666666666666, "grad_norm": 1.836065743887616e-09, "kl": 0.0482177734375, "learning_rate": 
4.134566081507585e-07, "loss": 0.0019, "num_tokens": 197166537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2753 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.918, "grad_norm": 8.54634196745252e-10, "kl": 0.0474853515625, "learning_rate": 4.101519397429976e-07, "loss": 0.0019, "num_tokens": 197240089.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2754 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9183333333333333, "grad_norm": 1.972982222042674e-09, "kl": 0.04437255859375, "learning_rate": 4.068602545994249e-07, "loss": 0.0018, "num_tokens": 197317497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2755 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9186666666666666, "grad_norm": 2.528717013561277e-09, "kl": 0.04681396484375, "learning_rate": 4.035815571765089e-07, "loss": 0.0019, "num_tokens": 197394905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2756 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.919, "grad_norm": 1.4672819625971556e-09, "kl": 0.04595947265625, "learning_rate": 4.003158519131245e-07, "loss": 0.0018, "num_tokens": 197471625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2757 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9193333333333333, "grad_norm": 1.781969127812033e-09, "kl": 0.0469970703125, "learning_rate": 3.9706314323056936e-07, "loss": 0.0019, "num_tokens": 197548681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2758 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9196666666666666, "grad_norm": 1.5447974011095766e-09, "kl": 0.04522705078125, "learning_rate": 3.9382343553253764e-07, "loss": 0.0018, "num_tokens": 197625481.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2759 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.92, "grad_norm": 1.4897868494401223e-09, "kl": 0.04791259765625, "learning_rate": 3.905967332051219e-07, "loss": 0.0019, "num_tokens": 197700649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2760 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9203333333333333, "grad_norm": 1.018335527014358e-09, "kl": 0.04583740234375, "learning_rate": 3.8738304061681107e-07, "loss": 0.0018, "num_tokens": 197773625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2761 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9206666666666666, "grad_norm": 1.5124627106288813e-09, "kl": 0.046142578125, "learning_rate": 3.8418236211848147e-07, "loss": 0.0018, "num_tokens": 197849769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2762 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.921, "grad_norm": 1.9083337132741462e-09, "kl": 0.0487060546875, "learning_rate": 3.809947020433824e-07, "loss": 0.0019, "num_tokens": 197926201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2763 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9213333333333333, "grad_norm": 1.0834209085430757e-09, "kl": 0.0426025390625, "learning_rate": 3.7782006470714614e-07, "loss": 0.0017, "num_tokens": 198000489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2764 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9216666666666666, "grad_norm": 1.6291868965012668e-09, "kl": 0.0426025390625, "learning_rate": 
3.746584544077736e-07, "loss": 0.0017, "num_tokens": 198075593.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2765 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.922, "grad_norm": 1.491265666508923e-09, "kl": 0.04559326171875, "learning_rate": 3.715098754256241e-07, "loss": 0.0018, "num_tokens": 198150329.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2766 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9223333333333333, "grad_norm": 1.621741851920433e-09, "kl": 0.0455322265625, "learning_rate": 3.68374332023419e-07, "loss": 0.0018, "num_tokens": 198224953.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2767 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9226666666666666, "grad_norm": 1.5805363684506801e-09, "kl": 0.0438232421875, "learning_rate": 3.65251828446227e-07, "loss": 0.0018, "num_tokens": 198298169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2768 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.923, "grad_norm": 1.1675860278614891e-09, "kl": 0.04052734375, "learning_rate": 3.6214236892146983e-07, "loss": 0.0016, "num_tokens": 198372745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2769 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9233333333333333, "grad_norm": 1.20636134415264e-09, "kl": 0.04632568359375, "learning_rate": 3.590459576589e-07, "loss": 0.0019, "num_tokens": 198447657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2770 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9236666666666666, "grad_norm": 1.1482861328460103e-09, "kl": 0.04736328125, "learning_rate": 3.55962598850611e-07, "loss": 0.0019, "num_tokens": 198522361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2771 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.924, "grad_norm": 1.812141880996876e-09, "kl": 0.0443115234375, "learning_rate": 3.5289229667102463e-07, "loss": 0.0018, "num_tokens": 198598393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2772 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9243333333333333, "grad_norm": 1.741155108980763e-09, "kl": 0.04730224609375, "learning_rate": 3.498350552768859e-07, "loss": 0.0019, "num_tokens": 198674265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2773 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9246666666666666, "grad_norm": 2.4794506447989306e-09, "kl": 0.048583984375, "learning_rate": 3.467908788072538e-07, "loss": 0.0019, "num_tokens": 198749193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2774 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.925, "grad_norm": 1.3046874691724497e-09, "kl": 0.04547119140625, "learning_rate": 3.4375977138350615e-07, "loss": 0.0018, "num_tokens": 198825193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2775 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9253333333333333, "grad_norm": 2.1103452318982363e-09, "kl": 0.04705810546875, "learning_rate": 3.4074173710931804e-07, "loss": 0.0019, "num_tokens": 198901577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2776 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9256666666666666, "grad_norm": 1.3667855736088086e-09, "kl": 0.04852294921875, "learning_rate": 
3.377367800706732e-07, "loss": 0.0019, "num_tokens": 198974889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2777 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.926, "grad_norm": 1.7653651873672516e-09, "kl": 0.04608154296875, "learning_rate": 3.347449043358475e-07, "loss": 0.0018, "num_tokens": 199051529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2778 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9263333333333333, "grad_norm": 1.382111536329944e-09, "kl": 0.041748046875, "learning_rate": 3.3176611395540625e-07, "loss": 0.0017, "num_tokens": 199125481.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2779 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9266666666666666, "grad_norm": 2.1750283796251324e-09, "kl": 0.044921875, "learning_rate": 3.288004129622013e-07, "loss": 0.0018, "num_tokens": 199203129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2780 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.927, "grad_norm": 1.929479465090367e-09, "kl": 0.04644775390625, "learning_rate": 3.2584780537136206e-07, "loss": 0.0019, "num_tokens": 199278489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2781 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9273333333333333, "grad_norm": 3.5863938485647395e-09, "kl": 0.04779052734375, "learning_rate": 3.2290829518028867e-07, "loss": 0.0019, "num_tokens": 199356841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2782 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9276666666666666, "grad_norm": 1.3098312434678405e-09, "kl": 0.0501708984375, "learning_rate": 3.1998188636865325e-07, "loss": 0.002, "num_tokens": 199431001.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2783 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.928, "grad_norm": 1.1819325518303003e-09, "kl": 0.04339599609375, "learning_rate": 3.1706858289838994e-07, "loss": 0.0017, "num_tokens": 199506969.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2784 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9283333333333333, "grad_norm": 1.0809571016068276e-09, "kl": 0.044189453125, "learning_rate": 3.1416838871368925e-07, "loss": 0.0018, "num_tokens": 199582249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2785 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9286666666666666, "grad_norm": 2.2859372172945314e-09, "kl": 0.047607421875, "learning_rate": 3.112813077409926e-07, "loss": 0.0019, "num_tokens": 199658137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2786 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.929, "grad_norm": 2.359157980080795e-09, "kl": 0.0445556640625, "learning_rate": 3.0840734388898897e-07, "loss": 0.0018, "num_tokens": 199737033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2787 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9293333333333333, "grad_norm": 2.253692565901133e-09, "kl": 0.0445556640625, "learning_rate": 3.0554650104861137e-07, "loss": 0.0018, "num_tokens": 199813897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2788 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9296666666666666, "grad_norm": 9.50005962785383e-10, "kl": 0.04217529296875, "learning_rate": 
3.026987830930239e-07, "loss": 0.0017, "num_tokens": 199888921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2789 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.93, "grad_norm": 2.291213663241365e-09, "kl": 0.04705810546875, "learning_rate": 2.9986419387762365e-07, "loss": 0.0019, "num_tokens": 199965993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2790 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9303333333333333, "grad_norm": 1.5451888657480595e-09, "kl": 0.04534912109375, "learning_rate": 2.970427372400353e-07, "loss": 0.0018, "num_tokens": 200042409.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2791 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9306666666666666, "grad_norm": 1.9826431607583572e-09, "kl": 0.04315185546875, "learning_rate": 2.94234417000101e-07, "loss": 0.0017, "num_tokens": 200117033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2792 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.931, "grad_norm": 1.260189841367776e-09, "kl": 0.04498291015625, "learning_rate": 2.9143923695987955e-07, "loss": 0.0018, "num_tokens": 200191673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2793 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9313333333333333, "grad_norm": 1.8147545688407263e-09, "kl": 0.0509033203125, "learning_rate": 2.8865720090364037e-07, "loss": 0.002, "num_tokens": 200267289.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2794 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9316666666666666, "grad_norm": 2.1262047678050067e-09, "kl": 0.04559326171875, "learning_rate": 2.858883125978551e-07, "loss": 0.0018, "num_tokens": 200340969.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2795 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.932, "grad_norm": 1.1869067950698309e-09, "kl": 0.04559326171875, "learning_rate": 2.831325757911985e-07, "loss": 0.0018, "num_tokens": 200418809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2796 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9323333333333333, "grad_norm": 2.5276900572634986e-09, "kl": 0.0474853515625, "learning_rate": 2.8038999421453827e-07, "loss": 0.0019, "num_tokens": 200495081.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2797 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9326666666666666, "grad_norm": 1.8831241010985877e-09, "kl": 0.04547119140625, "learning_rate": 2.7766057158093217e-07, "loss": 0.0018, "num_tokens": 200571193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2798 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.933, "grad_norm": 2.656353359498098e-09, "kl": 0.04669189453125, "learning_rate": 2.749443115856232e-07, "loss": 0.0019, "num_tokens": 200648345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2799 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9333333333333333, "grad_norm": 2.2433330748583558e-09, "kl": 0.047119140625, "learning_rate": 2.7224121790603517e-07, "loss": 0.0019, "num_tokens": 200725449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2800 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9336666666666666, "grad_norm": 1.7229500048898672e-09, "kl": 0.03826904296875, "learning_rate": 
2.6955129420176193e-07, "loss": 0.0015, "num_tokens": 200799817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2801 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.934, "grad_norm": 2.238356833217381e-09, "kl": 0.0465087890625, "learning_rate": 2.6687454411457256e-07, "loss": 0.0019, "num_tokens": 200876441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2802 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9343333333333333, "grad_norm": 1.3484483529779823e-09, "kl": 0.047119140625, "learning_rate": 2.6421097126839714e-07, "loss": 0.0019, "num_tokens": 200952649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2803 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9346666666666666, "grad_norm": 1.4187280239497113e-09, "kl": 0.04461669921875, "learning_rate": 2.6156057926932985e-07, "loss": 0.0018, "num_tokens": 201028633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2804 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.935, "grad_norm": 1.1227349050457747e-09, "kl": 0.047607421875, "learning_rate": 2.589233717056128e-07, "loss": 0.0019, "num_tokens": 201103385.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2805 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9353333333333333, "grad_norm": 9.553203561551982e-09, "kl": 0.042236328125, "learning_rate": 2.5629935214764866e-07, "loss": 0.0017, "num_tokens": 201184249.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2806 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9356666666666666, "grad_norm": 1.542145300348352e-09, "kl": 0.0440673828125, "learning_rate": 2.536885241479736e-07, "loss": 0.0018, "num_tokens": 201261801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2807 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.936, "grad_norm": 1.7628082327192374e-09, "kl": 0.04620361328125, "learning_rate": 2.510908912412746e-07, "loss": 0.0018, "num_tokens": 201336425.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2808 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9363333333333334, "grad_norm": 9.953243784721622e-10, "kl": 0.04473876953125, "learning_rate": 2.4850645694436736e-07, "loss": 0.0018, "num_tokens": 201410137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2809 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9366666666666666, "grad_norm": 2.7259841051119338e-09, "kl": 0.0478515625, "learning_rate": 2.4593522475620415e-07, "loss": 0.0019, "num_tokens": 201487641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2810 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.937, "grad_norm": 2.234079365948105e-09, "kl": 0.0472412109375, "learning_rate": 2.433771981578581e-07, "loss": 0.0019, "num_tokens": 201564377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2811 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9373333333333334, "grad_norm": 2.9833353565322795e-09, "kl": 0.04608154296875, "learning_rate": 2.4083238061252565e-07, "loss": 0.0018, "num_tokens": 201641337.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2812 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9376666666666666, "grad_norm": 1.631946910940485e-09, "kl": 0.04736328125, "learning_rate": 
2.3830077556552424e-07, "loss": 0.0019, "num_tokens": 201717689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2813 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.938, "grad_norm": 1.9443995302736994e-09, "kl": 0.0460205078125, "learning_rate": 2.3578238644427763e-07, "loss": 0.0018, "num_tokens": 201792985.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2814 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9383333333333334, "grad_norm": 1.3453519409623027e-09, "kl": 0.04547119140625, "learning_rate": 2.332772166583208e-07, "loss": 0.0018, "num_tokens": 201867145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 
0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2815 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9386666666666666, "grad_norm": 1.4726727615155255e-09, "kl": 0.0496826171875, "learning_rate": 2.307852695992907e-07, "loss": 0.002, "num_tokens": 201941385.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2816 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.939, "grad_norm": 1.1133699517884565e-09, "kl": 0.0416259765625, "learning_rate": 2.2830654864092083e-07, "loss": 0.0017, "num_tokens": 202016377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2817 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9393333333333334, "grad_norm": 1.8193979656189185e-09, "kl": 0.04669189453125, "learning_rate": 2.2584105713904126e-07, "loss": 0.0019, "num_tokens": 202091865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2818 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9396666666666667, "grad_norm": 1.4408650939046197e-09, "kl": 0.0439453125, "learning_rate": 2.233887984315697e-07, "loss": 0.0018, "num_tokens": 202166857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2819 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.94, "grad_norm": 1.6705423711016465e-09, "kl": 0.04229736328125, "learning_rate": 2.209497758385104e-07, "loss": 0.0017, "num_tokens": 202243161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2820 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9403333333333334, "grad_norm": 1.6784451606355333e-09, "kl": 0.04595947265625, "learning_rate": 2.1852399266194312e-07, "loss": 0.0018, "num_tokens": 202318217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2821 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9406666666666667, "grad_norm": 1.1895961993246829e-09, "kl": 0.0401611328125, "learning_rate": 2.161114521860308e-07, "loss": 0.0016, "num_tokens": 202393801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2822 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.941, "grad_norm": 1.2220880973856652e-09, "kl": 0.04547119140625, "learning_rate": 2.137121576770007e-07, "loss": 0.0018, "num_tokens": 202469145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2823 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9413333333333334, "grad_norm": 1.229170432104354e-09, "kl": 0.04510498046875, "learning_rate": 2.1132611238315004e-07, "loss": 0.0018, "num_tokens": 202543769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2824 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9416666666666667, "grad_norm": 2.06150896353563e-09, "kl": 0.04705810546875, "learning_rate": 
2.089533195348392e-07, "loss": 0.0019, "num_tokens": 202619321.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2825 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.942, "grad_norm": 1.8419480385389875e-09, "kl": 0.0477294921875, "learning_rate": 2.0659378234448524e-07, "loss": 0.0019, "num_tokens": 202697465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2826 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9423333333333334, "grad_norm": 1.99699923264518e-09, "kl": 0.046630859375, "learning_rate": 2.0424750400655947e-07, "loss": 0.0019, "num_tokens": 202774009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2827 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9426666666666667, "grad_norm": 1.5248219353836134e-09, "kl": 0.0469970703125, "learning_rate": 2.0191448769758315e-07, "loss": 0.0019, "num_tokens": 202850585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2828 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.943, "grad_norm": 1.2909239233138692e-09, "kl": 0.04510498046875, "learning_rate": 1.9959473657612193e-07, "loss": 0.0018, "num_tokens": 202924697.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2829 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9433333333333334, "grad_norm": 2.3433068818690117e-09, "kl": 0.0450439453125, "learning_rate": 1.9728825378278248e-07, "loss": 0.0018, "num_tokens": 203001513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2830 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9436666666666667, "grad_norm": 1.8256807177152723e-09, "kl": 0.04425048828125, "learning_rate": 1.9499504244020694e-07, "loss": 0.0018, "num_tokens": 203078473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2831 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.944, "grad_norm": 1.4050293151157689e-09, "kl": 0.038543701171875, "learning_rate": 1.9271510565307405e-07, "loss": 0.0015, "num_tokens": 203158313.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2832 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9443333333333334, "grad_norm": 1.134748295328336e-09, "kl": 0.04412841796875, "learning_rate": 1.9044844650808468e-07, "loss": 0.0018, "num_tokens": 203232505.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2833 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9446666666666667, "grad_norm": 1.644116731647216e-09, "kl": 0.047607421875, "learning_rate": 1.8819506807396748e-07, "loss": 0.0019, "num_tokens": 203309881.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2834 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.945, "grad_norm": 2.5901472078260213e-09, "kl": 0.04547119140625, "learning_rate": 1.8595497340147316e-07, "loss": 0.0018, "num_tokens": 203384505.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2835 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9453333333333334, "grad_norm": 1.6057630780608179e-09, "kl": 0.0477294921875, "learning_rate": 1.8372816552336025e-07, "loss": 0.0019, "num_tokens": 203460361.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2836 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9456666666666667, "grad_norm": 1.5923178331433974e-09, "kl": 0.045166015625, "learning_rate": 
1.8151464745440828e-07, "loss": 0.0018, "num_tokens": 203537129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2837 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.946, "grad_norm": 1.0007723538763003e-09, "kl": 0.04400634765625, "learning_rate": 1.793144221913967e-07, "loss": 0.0018, "num_tokens": 203610777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2838 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9463333333333334, "grad_norm": 1.3451804115049981e-09, "kl": 0.04827880859375, "learning_rate": 1.7712749271311392e-07, "loss": 0.0019, "num_tokens": 203686825.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 
0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2839 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9466666666666667, "grad_norm": 1.4056300567943936e-09, "kl": 0.04254150390625, "learning_rate": 1.7495386198034258e-07, "loss": 0.0017, "num_tokens": 203762345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2840 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.947, "grad_norm": 1.567531437984826e-09, "kl": 0.0479736328125, "learning_rate": 1.7279353293586765e-07, "loss": 0.0019, "num_tokens": 203837881.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2841 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9473333333333334, "grad_norm": 1.19234444539984e-09, "kl": 0.04571533203125, "learning_rate": 1.706465085044584e-07, "loss": 0.0018, "num_tokens": 203916169.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2842 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9476666666666667, "grad_norm": 1.7183039435764158e-09, "kl": 0.0445556640625, "learning_rate": 1.6851279159287526e-07, "loss": 0.0018, "num_tokens": 203991529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2843 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.948, "grad_norm": 1.2892065193170765e-09, "kl": 0.04473876953125, "learning_rate": 1.6639238508986188e-07, "loss": 0.0018, "num_tokens": 204067369.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2844 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9483333333333334, "grad_norm": 1.4408597648341015e-09, "kl": 0.047119140625, "learning_rate": 1.6428529186614195e-07, "loss": 0.0019, "num_tokens": 204141913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2845 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9486666666666667, "grad_norm": 1.7149521802650725e-09, "kl": 0.04803466796875, "learning_rate": 1.6219151477441243e-07, "loss": 0.0019, "num_tokens": 204217545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2846 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.949, "grad_norm": 3.3588483105972955e-09, "kl": 0.04742431640625, "learning_rate": 1.601110566493458e-07, "loss": 0.0019, "num_tokens": 204296153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2847 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9493333333333334, "grad_norm": 1.1913562358856211e-09, "kl": 0.0419921875, "learning_rate": 1.580439203075812e-07, "loss": 0.0017, "num_tokens": 204370281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2848 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9496666666666667, "grad_norm": 1.0094374225388947e-09, "kl": 0.045166015625, "learning_rate": 
1.5599010854772002e-07, "loss": 0.0018, "num_tokens": 204445161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": null, "rewards/penalized_accuracy_reward/std": null, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2849 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.95, "grad_norm": 1.824966289198926e-09, "kl": 0.0467529296875, "learning_rate": 1.5394962415032578e-07, "loss": 0.0019, "num_tokens": 204521017.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2850 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9503333333333334, "grad_norm": 2.020463352181423e-09, "kl": 0.04803466796875, "learning_rate": 1.519224698779198e-07, "loss": 0.0019, "num_tokens": 204596201.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2851 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9506666666666667, "grad_norm": 1.8384758160294723e-09, "kl": 0.0477294921875, "learning_rate": 1.4990864847497456e-07, "loss": 0.0019, "num_tokens": 204671193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2852 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.951, "grad_norm": 2.270664323233973e-09, "kl": 0.0450439453125, "learning_rate": 1.4790816266791018e-07, "loss": 0.0018, "num_tokens": 204746265.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2853 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9513333333333334, "grad_norm": 1.0415018847353963e-09, "kl": 0.0452880859375, "learning_rate": 1.4592101516509916e-07, "loss": 0.0018, "num_tokens": 204824121.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2854 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9516666666666667, "grad_norm": 2.003548438267444e-09, "kl": 0.04718017578125, "learning_rate": 1.4394720865684718e-07, "loss": 0.0019, "num_tokens": 204900937.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2855 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.952, "grad_norm": 1.874883803765215e-09, "kl": 0.0455322265625, "learning_rate": 1.419867458154034e-07, "loss": 0.0018, "num_tokens": 204978665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2856 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9523333333333334, "grad_norm": 1.3125730502494548e-09, "kl": 0.04522705078125, "learning_rate": 1.400396292949513e-07, "loss": 0.0018, "num_tokens": 205055305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2857 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9526666666666667, "grad_norm": 2.379470176450127e-09, "kl": 0.04522705078125, "learning_rate": 1.3810586173160224e-07, "loss": 0.0018, "num_tokens": 205132313.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2858 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.953, "grad_norm": 1.365435209343957e-09, "kl": 0.04461669921875, "learning_rate": 1.3618544574339976e-07, "loss": 0.0018, "num_tokens": 205208585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2859 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9533333333333334, "grad_norm": 2.9286539859896266e-09, "kl": 0.04425048828125, "learning_rate": 1.3427838393030634e-07, "loss": 0.0018, "num_tokens": 205286121.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2860 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9536666666666667, "grad_norm": 1.1676538624882937e-09, "kl": 0.04595947265625, "learning_rate": 
1.323846788742078e-07, "loss": 0.0018, "num_tokens": 205361657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2861 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.954, "grad_norm": 1.8650967437139343e-09, "kl": 0.04559326171875, "learning_rate": 1.3050433313890774e-07, "loss": 0.0018, "num_tokens": 205436889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2862 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9543333333333334, "grad_norm": 1.7542512997792414e-09, "kl": 0.04840087890625, "learning_rate": 1.2863734927012094e-07, "loss": 0.0019, "num_tokens": 205511801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 
0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2863 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9546666666666667, "grad_norm": 1.8528545364659976e-09, "kl": 0.0452880859375, "learning_rate": 1.2678372979547326e-07, "loss": 0.0018, "num_tokens": 205588649.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2864 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.955, "grad_norm": 1.8584306316071775e-09, "kl": 0.04400634765625, "learning_rate": 1.2494347722449506e-07, "loss": 0.0018, "num_tokens": 205663849.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2865 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9553333333333334, "grad_norm": 9.341071249835409e-10, "kl": 0.04730224609375, "learning_rate": 1.231165940486234e-07, "loss": 0.0019, "num_tokens": 205739769.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2866 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9556666666666667, "grad_norm": 2.132524601350383e-09, "kl": 0.0435791015625, "learning_rate": 1.2130308274119207e-07, "loss": 0.0017, "num_tokens": 205815513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2867 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.956, "grad_norm": 2.035346113871128e-09, "kl": 0.04541015625, "learning_rate": 1.1950294575743372e-07, "loss": 0.0018, "num_tokens": 205891225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2868 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9563333333333334, "grad_norm": 1.5192248570272682e-09, "kl": 0.04150390625, "learning_rate": 1.1771618553447217e-07, "loss": 0.0017, "num_tokens": 205971609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2869 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9566666666666667, "grad_norm": 1.6769144961514826e-09, "kl": 0.0489501953125, "learning_rate": 1.1594280449132245e-07, "loss": 0.002, "num_tokens": 206046729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2870 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.957, "grad_norm": 1.3430211387444047e-09, "kl": 0.043212890625, "learning_rate": 1.1418280502888401e-07, "loss": 0.0017, "num_tokens": 206119801.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2871 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9573333333333334, "grad_norm": 1.8542503088525564e-09, "kl": 0.044189453125, "learning_rate": 1.1243618952994195e-07, "loss": 0.0018, "num_tokens": 206195625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2872 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9576666666666667, "grad_norm": 1.0374904269028207e-09, "kl": 0.04833984375, "learning_rate": 
1.1070296035916028e-07, "loss": 0.0019, "num_tokens": 206270905.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2873 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.958, "grad_norm": 1.4874067533199309e-09, "kl": 0.0458984375, "learning_rate": 1.0898311986307975e-07, "loss": 0.0018, "num_tokens": 206345721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2874 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9583333333333334, "grad_norm": 1.2928865755768015e-09, "kl": 0.04388427734375, "learning_rate": 1.0727667037011668e-07, "loss": 0.0018, "num_tokens": 206419321.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2875 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9586666666666667, "grad_norm": 1.1247179854123601e-09, "kl": 0.0517578125, "learning_rate": 1.055836141905553e-07, "loss": 0.0021, "num_tokens": 206495753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2876 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.959, "grad_norm": 1.0328591315555968e-09, "kl": 0.04681396484375, "learning_rate": 1.039039536165476e-07, "loss": 0.0019, "num_tokens": 206569737.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2877 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9593333333333334, "grad_norm": 1.6002604796838682e-09, "kl": 0.047119140625, "learning_rate": 1.0223769092211012e-07, "loss": 0.0019, "num_tokens": 206645497.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2878 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9596666666666667, "grad_norm": 1.1132157418103361e-09, "kl": 0.04376220703125, "learning_rate": 1.0058482836312278e-07, "loss": 0.0018, "num_tokens": 206718857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2879 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.96, "grad_norm": 1.2266860860421502e-09, "kl": 0.04254150390625, "learning_rate": 9.894536817732226e-08, "loss": 0.0017, "num_tokens": 206795097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2880 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9603333333333334, "grad_norm": 1.0729875876691608e-09, "kl": 0.04473876953125, "learning_rate": 9.731931258429638e-08, "loss": 0.0018, "num_tokens": 206872473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2881 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9606666666666667, "grad_norm": 2.1439465758277265e-09, "kl": 0.0478515625, "learning_rate": 9.57066637854931e-08, "loss": 0.0019, "num_tokens": 206949609.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2882 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.961, "grad_norm": 1.2095711099391337e-09, "kl": 0.046142578125, "learning_rate": 9.410742396420259e-08, "loss": 0.0018, "num_tokens": 207024793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2883 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9613333333333334, "grad_norm": 1.4314726071162909e-09, "kl": 0.04443359375, "learning_rate": 9.252159528556404e-08, "loss": 0.0018, "num_tokens": 207100761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2884 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9616666666666667, "grad_norm": 2.614742644624357e-09, "kl": 0.04522705078125, "learning_rate": 
9.094917989656005e-08, "loss": 0.0018, "num_tokens": 207177289.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2885 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.962, "grad_norm": 1.0771308289747594e-09, "kl": 0.0433349609375, "learning_rate": 8.939017992601329e-08, "loss": 0.0017, "num_tokens": 207252729.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2886 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9623333333333334, "grad_norm": 1.5476608883346898e-09, "kl": 0.05035400390625, "learning_rate": 8.784459748458318e-08, "loss": 0.002, "num_tokens": 207328153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2887 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9626666666666667, "grad_norm": 2.431616463738351e-09, "kl": 0.0460205078125, "learning_rate": 8.631243466476368e-08, "loss": 0.0018, "num_tokens": 207404345.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2888 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.963, "grad_norm": 1.7921398809406242e-09, "kl": 0.04473876953125, "learning_rate": 8.479369354088329e-08, "loss": 0.0018, "num_tokens": 207482153.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2889 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9633333333333334, "grad_norm": 1.2141361249717875e-09, "kl": 0.04571533203125, "learning_rate": 8.328837616909612e-08, "loss": 0.0018, "num_tokens": 207558185.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2890 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9636666666666667, "grad_norm": 1.8377752653009338e-09, "kl": 0.046630859375, "learning_rate": 8.179648458738309e-08, "loss": 0.0019, "num_tokens": 207633193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2891 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.964, "grad_norm": 1.3388734565467075e-09, "kl": 0.047119140625, "learning_rate": 8.031802081554963e-08, "loss": 0.0019, "num_tokens": 207708681.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2892 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9643333333333334, "grad_norm": 1.762052503906375e-09, "kl": 0.0487060546875, "learning_rate": 7.885298685522235e-08, "loss": 0.0019, "num_tokens": 207784809.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2893 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9646666666666667, "grad_norm": 9.543931200894917e-10, "kl": 0.04534912109375, "learning_rate": 7.740138468984249e-08, "loss": 0.0018, "num_tokens": 207859353.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2894 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.965, "grad_norm": 1.5075356518678973e-09, "kl": 0.045654296875, "learning_rate": 7.596321628467129e-08, "loss": 0.0018, "num_tokens": 207935097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2895 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9653333333333334, "grad_norm": 1.4149178495515002e-09, "kl": 0.044921875, "learning_rate": 7.453848358678018e-08, "loss": 0.0018, "num_tokens": 208010857.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2896 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9656666666666667, "grad_norm": 1.5315223533818312e-09, "kl": 0.04779052734375, "learning_rate": 
7.31271885250484e-08, "loss": 0.0019, "num_tokens": 208085225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2897 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.966, "grad_norm": 1.2852930941775753e-09, "kl": 0.04315185546875, "learning_rate": 7.17293330101676e-08, "loss": 0.0017, "num_tokens": 208161225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2898 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9663333333333334, "grad_norm": 1.6096177724023164e-09, "kl": 0.0455322265625, "learning_rate": 7.034491893463059e-08, "loss": 0.0018, "num_tokens": 208236185.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2899 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9666666666666667, "grad_norm": 1.301025953637236e-09, "kl": 0.048583984375, "learning_rate": 6.897394817273251e-08, "loss": 0.0019, "num_tokens": 208309817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2900 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.967, "grad_norm": 1.3912451191089303e-09, "kl": 0.0452880859375, "learning_rate": 6.761642258056977e-08, "loss": 0.0018, "num_tokens": 208384601.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2901 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9673333333333334, "grad_norm": 1.964226781225875e-09, "kl": 0.04876708984375, "learning_rate": 6.627234399603554e-08, "loss": 0.0019, "num_tokens": 208460553.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2902 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9676666666666667, "grad_norm": 1.1935673560614646e-09, "kl": 0.04437255859375, "learning_rate": 6.494171423881756e-08, "loss": 0.0018, "num_tokens": 208537145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2903 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.968, "grad_norm": 1.7335108903893115e-09, "kl": 0.0479736328125, "learning_rate": 6.362453511039368e-08, "loss": 0.0019, "num_tokens": 208612665.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2904 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9683333333333334, "grad_norm": 2.3246899960582823e-09, "kl": 0.04437255859375, "learning_rate": 6.232080839403631e-08, "loss": 0.0018, "num_tokens": 208689641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2905 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9686666666666667, "grad_norm": 1.3625027772690146e-09, "kl": 0.04541015625, "learning_rate": 6.103053585480023e-08, "loss": 0.0018, "num_tokens": 208766793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2906 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.969, "grad_norm": 1.274568006692789e-09, "kl": 0.04608154296875, "learning_rate": 5.975371923952921e-08, "loss": 0.0018, "num_tokens": 208841209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2907 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9693333333333334, "grad_norm": 9.741473183666471e-10, "kl": 0.0457763671875, "learning_rate": 5.849036027684607e-08, "loss": 0.0018, "num_tokens": 208915161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2908 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9696666666666667, "grad_norm": 1.227282053761769e-09, "kl": 0.041748046875, "learning_rate": 
5.724046067715705e-08, "loss": 0.0017, "num_tokens": 208991033.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2909 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.97, "grad_norm": 1.3553156374968012e-09, "kl": 0.04925537109375, "learning_rate": 5.600402213264411e-08, "loss": 0.002, "num_tokens": 209067465.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2910 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9703333333333334, "grad_norm": 1.697823548418853e-09, "kl": 0.0477294921875, "learning_rate": 5.4781046317267103e-08, "loss": 0.0019, "num_tokens": 209142793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2911 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9706666666666667, "grad_norm": 1.3070936555337198e-09, "kl": 0.0447998046875, "learning_rate": 5.3571534886756035e-08, "loss": 0.0018, "num_tokens": 209217657.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2912 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.971, "grad_norm": 1.4478411802798519e-09, "kl": 0.04486083984375, "learning_rate": 5.2375489478616593e-08, "loss": 0.0018, "num_tokens": 209292297.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2913 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9713333333333334, "grad_norm": 1.4970747974629717e-09, "kl": 0.044189453125, "learning_rate": 5.119291171211793e-08, "loss": 0.0018, "num_tokens": 209367161.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2914 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9716666666666667, "grad_norm": 1.5472362280277707e-09, "kl": 0.04730224609375, "learning_rate": 5.002380318830158e-08, "loss": 0.0019, "num_tokens": 209443545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2915 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.972, "grad_norm": 2.7980964212304116e-09, "kl": 0.0452880859375, "learning_rate": 4.88681654899692e-08, "loss": 0.0018, "num_tokens": 209519881.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2916 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9723333333333334, "grad_norm": 1.9377759397087857e-09, "kl": 0.04351806640625, "learning_rate": 4.772600018168816e-08, "loss": 0.0017, "num_tokens": 209599225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2917 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9726666666666667, "grad_norm": 9.536348377636727e-10, "kl": 0.0439453125, "learning_rate": 4.659730880978375e-08, "loss": 0.0018, "num_tokens": 209673049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2918 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.973, "grad_norm": 1.9756634106471438e-09, "kl": 0.0474853515625, "learning_rate": 4.54820929023414e-08, "loss": 0.0019, "num_tokens": 209749225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2919 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9733333333333334, "grad_norm": 1.550075512390947e-09, "kl": 0.0458984375, "learning_rate": 4.438035396920004e-08, "loss": 0.0018, "num_tokens": 209827097.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2920 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9736666666666667, "grad_norm": 1.2801216753288713e-09, "kl": 0.0469970703125, "learning_rate": 
4.329209350195651e-08, "loss": 0.0019, "num_tokens": 209900393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2921 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.974, "grad_norm": 1.7890703363221405e-09, "kl": 0.04486083984375, "learning_rate": 4.2217312973955594e-08, "loss": 0.0018, "num_tokens": 209976793.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2922 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9743333333333334, "grad_norm": 9.99578309013316e-10, "kl": 0.04931640625, "learning_rate": 4.115601384029666e-08, "loss": 0.002, "num_tokens": 210052041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2923 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9746666666666667, "grad_norm": 2.530295750702294e-09, "kl": 0.0452880859375, "learning_rate": 4.010819753782369e-08, "loss": 0.0018, "num_tokens": 210128873.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2924 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.975, "grad_norm": 1.4857606256413192e-09, "kl": 0.0472412109375, "learning_rate": 3.907386548512748e-08, "loss": 0.0019, "num_tokens": 210204585.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2925 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9753333333333334, "grad_norm": 1.181739484046318e-09, "kl": 0.042724609375, "learning_rate": 3.805301908254455e-08, "loss": 0.0017, "num_tokens": 210280009.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2926 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9756666666666667, "grad_norm": 1.7818566622196386e-09, "kl": 0.0450439453125, "learning_rate": 3.704565971215379e-08, "loss": 0.0018, "num_tokens": 210355865.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2927 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.976, "grad_norm": 1.3478381744036483e-09, "kl": 0.04571533203125, "learning_rate": 3.605178873777204e-08, "loss": 0.0018, "num_tokens": 210429641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2928 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9763333333333334, "grad_norm": 9.184745186630039e-10, "kl": 0.0430908203125, "learning_rate": 3.50714075049563e-08, "loss": 0.0017, "num_tokens": 210504073.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2929 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9766666666666667, "grad_norm": 3.5453104896276955e-09, "kl": 0.04327392578125, "learning_rate": 3.410451734100262e-08, "loss": 0.0017, "num_tokens": 210582713.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2930 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.977, "grad_norm": 1.5028326361132827e-09, "kl": 0.04400634765625, "learning_rate": 3.315111955493944e-08, "loss": 0.0018, "num_tokens": 210659209.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2931 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9773333333333334, "grad_norm": 1.7363386284330318e-09, "kl": 0.0452880859375, "learning_rate": 3.22112154375287e-08, "loss": 0.0018, "num_tokens": 210735129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2932 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9776666666666667, "grad_norm": 1.4197287789841084e-09, "kl": 0.04571533203125, "learning_rate": 
3.1284806261264735e-08, "loss": 0.0018, "num_tokens": 210810441.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2933 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.978, "grad_norm": 1.538000504730519e-09, "kl": 0.0491943359375, "learning_rate": 3.037189328036982e-08, "loss": 0.002, "num_tokens": 210884745.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2934 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9783333333333334, "grad_norm": 2.8975926102958738e-09, "kl": 0.04522705078125, "learning_rate": 2.947247773079753e-08, "loss": 0.0018, "num_tokens": 210962633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2935 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9786666666666667, "grad_norm": 1.5917376305907283e-09, "kl": 0.04351806640625, "learning_rate": 2.858656083022604e-08, "loss": 0.0017, "num_tokens": 211038937.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2936 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.979, "grad_norm": 8.104458770752387e-10, "kl": 0.04290771484375, "learning_rate": 2.7714143778058146e-08, "loss": 0.0017, "num_tokens": 211112841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2937 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9793333333333333, "grad_norm": 1.6967492966202258e-09, "kl": 0.04425048828125, "learning_rate": 2.6855227755419046e-08, "loss": 0.0018, "num_tokens": 211191225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2938 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9796666666666667, "grad_norm": 2.49811260566446e-09, "kl": 0.04876708984375, "learning_rate": 2.6009813925157446e-08, "loss": 0.0019, "num_tokens": 211266281.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2939 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.98, "grad_norm": 1.821839790139279e-09, "kl": 0.04718017578125, "learning_rate": 2.5177903431842233e-08, "loss": 0.0019, "num_tokens": 211342377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2940 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9803333333333333, "grad_norm": 1.2046053043945903e-09, "kl": 0.04461669921875, "learning_rate": 2.4359497401758026e-08, "loss": 0.0018, "num_tokens": 211416777.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2941 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9806666666666667, "grad_norm": 1.3474217297471114e-09, "kl": 0.04681396484375, "learning_rate": 2.3554596942907404e-08, "loss": 0.0019, "num_tokens": 211491065.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2942 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.981, "grad_norm": 1.6224258603259045e-09, "kl": 0.04571533203125, "learning_rate": 2.2763203145010904e-08, "loss": 0.0018, "num_tokens": 211567145.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2943 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9813333333333333, "grad_norm": 1.3303408374909509e-09, "kl": 0.04632568359375, "learning_rate": 2.1985317079500358e-08, "loss": 0.0019, "num_tokens": 211641929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2944 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9816666666666667, "grad_norm": 1.0727693178225195e-09, "kl": 0.04339599609375, "learning_rate": 
2.1220939799520003e-08, "loss": 0.0017, "num_tokens": 211717049.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2945 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.982, "grad_norm": 1.5592430679944869e-09, "kl": 0.0450439453125, "learning_rate": 2.0470072339926482e-08, "loss": 0.0018, "num_tokens": 211791913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2946 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9823333333333333, "grad_norm": 1.6663457280685634e-09, "kl": 0.0469970703125, "learning_rate": 1.973271571728441e-08, "loss": 0.0019, "num_tokens": 211868921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2947 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9826666666666667, "grad_norm": 1.7467541857385527e-09, "kl": 0.04486083984375, "learning_rate": 1.9008870929869692e-08, "loss": 0.0018, "num_tokens": 211947673.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2948 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.983, "grad_norm": 1.6448671313895602e-09, "kl": 0.04595947265625, "learning_rate": 1.829853895766176e-08, "loss": 0.0018, "num_tokens": 212021193.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2949 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9833333333333333, "grad_norm": 2.2270771893317942e-09, "kl": 0.04315185546875, "learning_rate": 1.7601720762346895e-08, "loss": 0.0017, "num_tokens": 212099225.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2950 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9836666666666667, "grad_norm": 1.8324621819942877e-09, "kl": 0.0440673828125, "learning_rate": 1.6918417287318245e-08, "loss": 0.0018, "num_tokens": 212178393.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2951 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.984, "grad_norm": 1.5053276403165228e-09, "kl": 0.04345703125, "learning_rate": 1.624862945766692e-08, "loss": 0.0017, "num_tokens": 212254041.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2952 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9843333333333333, "grad_norm": 1.5199475011939967e-09, "kl": 0.04852294921875, "learning_rate": 1.5592358180189782e-08, "loss": 0.0019, "num_tokens": 212329129.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2953 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9846666666666667, "grad_norm": 5.194201957436917e-09, "kl": 0.0499267578125, "learning_rate": 1.4949604343383882e-08, "loss": 0.002, "num_tokens": 212407929.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2954 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.985, "grad_norm": 2.071116611546131e-09, "kl": 0.04473876953125, "learning_rate": 1.4320368817443142e-08, "loss": 0.0018, "num_tokens": 212487513.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2955 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9853333333333333, "grad_norm": 9.866411021519639e-10, "kl": 0.044189453125, "learning_rate": 1.370465245426167e-08, "loss": 0.0018, "num_tokens": 212560921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2956 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9856666666666667, "grad_norm": 2.2598105609006325e-09, "kl": 0.04339599609375, "learning_rate": 
1.3102456087430437e-08, "loss": 0.0017, "num_tokens": 212640537.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2957 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.986, "grad_norm": 2.6744608749851295e-09, "kl": 0.04046630859375, "learning_rate": 1.2513780532236175e-08, "loss": 0.0016, "num_tokens": 212716841.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2958 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9863333333333333, "grad_norm": 1.0582833498418154e-09, "kl": 0.0440673828125, "learning_rate": 1.1938626585660252e-08, "loss": 0.0018, "num_tokens": 212792137.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 
0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2959 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9866666666666667, "grad_norm": 1.0848154596843074e-09, "kl": 0.04638671875, "learning_rate": 1.1376995026376459e-08, "loss": 0.0019, "num_tokens": 212866473.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2960 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.987, "grad_norm": 1.36530775574073e-09, "kl": 0.0452880859375, "learning_rate": 1.0828886614754342e-08, "loss": 0.0018, "num_tokens": 212939721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2961 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9873333333333333, "grad_norm": 1.657683545985833e-09, "kl": 0.04949951171875, "learning_rate": 1.0294302092853647e-08, "loss": 0.002, "num_tokens": 213017641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2962 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9876666666666667, "grad_norm": 1.4429349937117308e-09, "kl": 0.0435791015625, "learning_rate": 9.773242184422105e-09, "loss": 0.0017, "num_tokens": 213093305.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2963 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.988, "grad_norm": 3.4310256857850163e-09, "kl": 0.04547119140625, "learning_rate": 9.265707594899864e-09, "loss": 0.0018, "num_tokens": 213170377.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2964 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9883333333333333, "grad_norm": 1.935559934551634e-09, "kl": 0.0472412109375, "learning_rate": 8.771699011416169e-09, "loss": 0.0019, "num_tokens": 213247577.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2965 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9886666666666667, "grad_norm": 1.523074333320551e-09, "kl": 0.043701171875, "learning_rate": 8.29121710278713e-09, "loss": 0.0017, "num_tokens": 213322633.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2966 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.989, "grad_norm": 1.2285142902968005e-09, "kl": 0.04644775390625, "learning_rate": 7.824262519514625e-09, "loss": 0.0019, "num_tokens": 213398233.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2967 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9893333333333333, "grad_norm": 9.537842737827873e-10, "kl": 0.04425048828125, "learning_rate": 7.370835893788508e-09, "loss": 0.0018, "num_tokens": 213473689.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2968 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9896666666666667, "grad_norm": 1.1185187220874582e-09, "kl": 0.042236328125, "learning_rate": 
6.930937839481067e-09, "loss": 0.0017, "num_tokens": 213550489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2969 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.99, "grad_norm": 1.2443580610366212e-09, "kl": 0.0458984375, "learning_rate": 6.504568952152568e-09, "loss": 0.0018, "num_tokens": 213624761.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2970 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9903333333333333, "grad_norm": 1.9045065524636584e-09, "kl": 0.04449462890625, "learning_rate": 6.091729809042379e-09, "loss": 0.0018, "num_tokens": 213700953.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2971 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9906666666666667, "grad_norm": 1.3548655530826181e-09, "kl": 0.04327392578125, "learning_rate": 5.6924209690767395e-09, "loss": 0.0017, "num_tokens": 213775721.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2972 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.991, "grad_norm": 1.8117720657073733e-09, "kl": 0.0498046875, "learning_rate": 5.306642972862097e-09, "loss": 0.002, "num_tokens": 213852889.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2973 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9913333333333333, "grad_norm": 1.3182867020233857e-09, "kl": 0.04449462890625, "learning_rate": 4.9343963426840006e-09, "loss": 0.0018, "num_tokens": 213927913.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2974 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9916666666666667, "grad_norm": 2.8226203596659616e-09, "kl": 0.04742431640625, "learning_rate": 4.575681582512648e-09, "loss": 0.0019, "num_tokens": 214003529.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2975 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.992, "grad_norm": 3.2701474861340785e-09, "kl": 0.04498291015625, "learning_rate": 4.230499177994007e-09, "loss": 0.0018, "num_tokens": 214081433.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2976 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9923333333333333, "grad_norm": 2.0030501701739922e-09, "kl": 0.044921875, "learning_rate": 3.898849596456477e-09, "loss": 0.0018, "num_tokens": 214155545.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2977 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9926666666666667, "grad_norm": 1.888650125181357e-09, "kl": 0.0489501953125, "learning_rate": 3.5807332869042256e-09, "loss": 0.002, "num_tokens": 214231993.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2978 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.993, "grad_norm": 2.888982386650696e-09, "kl": 0.04742431640625, "learning_rate": 3.276150680021628e-09, "loss": 0.0019, "num_tokens": 214311561.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2979 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9933333333333333, "grad_norm": 1.0059407751228377e-09, "kl": 0.04742431640625, "learning_rate": 2.9851021881688314e-09, "loss": 0.0019, "num_tokens": 214385641.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2980 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9936666666666667, "grad_norm": 1.671291660620966e-09, "kl": 0.048828125, "learning_rate": 
2.7075882053828605e-09, "loss": 0.002, "num_tokens": 214461945.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2981 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.994, "grad_norm": 2.2417905309879416e-09, "kl": 0.04388427734375, "learning_rate": 2.4436091073787304e-09, "loss": 0.0018, "num_tokens": 214541289.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2982 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9943333333333333, "grad_norm": 2.0122878918726883e-09, "kl": 0.045654296875, "learning_rate": 2.193165251545004e-09, "loss": 0.0018, "num_tokens": 214615785.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2983 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9946666666666667, "grad_norm": 1.3068360837920068e-09, "kl": 0.04376220703125, "learning_rate": 1.956256976947124e-09, "loss": 0.0018, "num_tokens": 214691385.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2984 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.995, "grad_norm": 1.7166823518266483e-09, "kl": 0.04534912109375, "learning_rate": 1.7328846043229707e-09, "loss": 0.0018, "num_tokens": 214767897.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2985 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9953333333333333, "grad_norm": 1.6194776630840124e-09, "kl": 0.0521240234375, "learning_rate": 1.5230484360873043e-09, "loss": 0.0021, "num_tokens": 214844217.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2986 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9956666666666667, "grad_norm": 1.1248711961897584e-09, "kl": 0.04620361328125, "learning_rate": 1.3267487563284332e-09, "loss": 0.0019, "num_tokens": 214922697.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2987 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.996, "grad_norm": 9.915434029394987e-10, "kl": 0.04498291015625, "learning_rate": 1.1439858308071038e-09, "loss": 0.0018, "num_tokens": 214997449.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2988 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9963333333333333, "grad_norm": 1.6820052017862963e-09, "kl": 0.04534912109375, "learning_rate": 9.74759906957612e-10, "loss": 0.0018, "num_tokens": 215072921.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2989 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9966666666666667, "grad_norm": 2.472805737951944e-09, "kl": 0.043701171875, "learning_rate": 8.19071213887801e-10, "loss": 0.0017, "num_tokens": 215152937.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2990 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, 
"completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.997, "grad_norm": 1.426590956477014e-09, "kl": 0.0416259765625, "learning_rate": 6.769199623779532e-10, "loss": 0.0017, "num_tokens": 215227705.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2991 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9973333333333333, "grad_norm": 1.0423627516686906e-09, "kl": 0.0418701171875, "learning_rate": 5.483063448785686e-10, "loss": 0.0017, "num_tokens": 215303561.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2992 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9976666666666667, "grad_norm": 1.9338459722462176e-09, "kl": 0.0458984375, "learning_rate": 
4.332305355159161e-10, "loss": 0.0018, "num_tokens": 215383241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2993 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.998, "grad_norm": 2.1644668279918733e-09, "kl": 0.0440673828125, "learning_rate": 3.316926900842621e-10, "loss": 0.0018, "num_tokens": 215461625.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2994 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9983333333333333, "grad_norm": 1.713874042685859e-09, "kl": 0.04571533203125, "learning_rate": 2.436929460525317e-10, "loss": 0.0018, "num_tokens": 215538601.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2995 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9986666666666667, "grad_norm": 1.55350254882336e-09, "kl": 0.04632568359375, "learning_rate": 1.6923142255764745e-10, "loss": 0.0019, "num_tokens": 215612489.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2996 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.999, "grad_norm": 2.735165649525584e-09, "kl": 0.04730224609375, "learning_rate": 1.0830822041230093e-10, "loss": 0.0019, "num_tokens": 215688697.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2997 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, 
"completions/min_terminated_length": 0.0, "epoch": 0.9993333333333333, "grad_norm": 2.0403514433553482e-09, "kl": 0.0482177734375, "learning_rate": 6.092342209607083e-11, "loss": 0.0019, "num_tokens": 215763817.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2998 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9996666666666667, "grad_norm": 1.1584365688932508e-09, "kl": 0.0440673828125, "learning_rate": 2.7077091762084396e-11, "loss": 0.0018, "num_tokens": 215839241.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 2999 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0, "grad_norm": 1.9258936667654325e-09, "kl": 0.0445556640625, "learning_rate": 6.7692752314663104e-12, "loss": 0.0018, "num_tokens": 215917753.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/reasoning_steps_reward/mean": 0.0, "rewards/reasoning_steps_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 3000 }, { "epoch": 1.0, "step": 3000, "total_flos": 0.0, "train_loss": 0.01067225778910021, "train_runtime": 90873.507, "train_samples_per_second": 0.132, "train_steps_per_second": 0.033 } ], "logging_steps": 1, "max_steps": 3000, "num_input_tokens_seen": 215917753, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }