{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 3000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 871.25,
      "completions/max_terminated_length": 820.5,
      "completions/mean_length": 635.265625,
      "completions/mean_terminated_length": 614.8063507080078,
      "completions/min_length": 324.5,
      "completions/min_terminated_length": 324.5,
      "epoch": 0.0003333333333333333,
      "grad_norm": 0.7860351204872131,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": 0.0304,
      "num_tokens": 50705.0,
      "reward": 0.22903646156191826,
      "reward_std": 0.18850377202033997,
      "rewards/format_reward/mean": 0.015625,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.4166666716337204,
      "rewards/reasoning_steps_reward/std": 0.36324381828308105,
      "rewards/tag_count_reward/mean": 0.14453125,
      "rewards/tag_count_reward/std": 0.25423113256692886,
      "step": 1
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 921.0,
      "completions/max_terminated_length": 869.75,
      "completions/mean_length": 679.21875,
      "completions/mean_terminated_length": 612.8114471435547,
      "completions/min_length": 237.0,
      "completions/min_terminated_length": 237.0,
      "epoch": 0.0006666666666666666,
      "grad_norm": 0.6927920579910278,
      "kl": 0.0,
      "learning_rate": 6.666666666666668e-08,
      "loss": -0.112,
      "num_tokens": 105535.0,
      "reward": 0.28307291865348816,
      "reward_std": 0.20059899613261223,
      "rewards/format_reward/mean": 0.0625,
      "rewards/format_reward/std": 0.21039126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.4895833507180214,
      "rewards/reasoning_steps_reward/std": 0.34962671622633934,
      "rewards/tag_count_reward/mean": 0.1328125,
      "rewards/tag_count_reward/std": 0.28246864676475525,
      "step": 2
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 989.75,
      "completions/max_terminated_length": 917.75,
      "completions/mean_length": 609.421875,
      "completions/mean_terminated_length": 550.6205444335938,
      "completions/min_length": 274.5,
      "completions/min_terminated_length": 274.5,
      "epoch": 0.001,
      "grad_norm": 0.8291285037994385,
      "kl": 0.00030994415283203125,
      "learning_rate": 1.3333333333333336e-07,
      "loss": -0.0794,
      "num_tokens": 153722.0,
      "reward": 0.16028646053746343,
      "reward_std": 0.1685022683814168,
      "rewards/format_reward/mean": 0.015625,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.29166666930541396,
      "rewards/reasoning_steps_reward/std": 0.32445300184190273,
      "rewards/tag_count_reward/mean": 0.08203125,
      "rewards/tag_count_reward/std": 0.2137749269604683,
      "step": 3
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 987.75,
      "completions/max_terminated_length": 894.25,
      "completions/mean_length": 632.671875,
      "completions/mean_terminated_length": 609.0352783203125,
      "completions/min_length": 299.25,
      "completions/min_terminated_length": 299.25,
      "epoch": 0.0013333333333333333,
      "grad_norm": 0.6764264702796936,
      "kl": 0.0003440380096435547,
      "learning_rate": 2.0000000000000002e-07,
      "loss": 0.0428,
      "num_tokens": 204677.0,
      "reward": 0.45119864493608475,
      "reward_std": 0.5436567962169647,
      "rewards/format_reward/mean": 0.03125,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.17671946436166763,
      "rewards/penalized_accuracy_reward/std": 0.4693755432963371,
      "rewards/reasoning_steps_reward/mean": 0.4895833432674408,
      "rewards/reasoning_steps_reward/std": 0.364622987806797,
      "rewards/tag_count_reward/mean": 0.171875,
      "rewards/tag_count_reward/std": 0.321233332157135,
      "step": 4
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 890.25,
      "completions/max_terminated_length": 841.75,
      "completions/mean_length": 495.625,
      "completions/mean_terminated_length": 484.78125762939453,
      "completions/min_length": 206.0,
      "completions/min_terminated_length": 206.0,
      "epoch": 0.0016666666666666668,
      "grad_norm": 0.9639339447021484,
      "kl": 0.0004448890686035156,
      "learning_rate": 2.666666666666667e-07,
      "loss": -0.1647,
      "num_tokens": 245197.0,
      "reward": 0.2981770969927311,
      "reward_std": 0.34278450906276703,
      "rewards/format_reward/mean": 0.015625,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.07265625149011612,
      "rewards/penalized_accuracy_reward/std": 0.15937501192092896,
      "rewards/reasoning_steps_reward/mean": 0.4010416679084301,
      "rewards/reasoning_steps_reward/std": 0.41512854397296906,
      "rewards/tag_count_reward/mean": 0.1875,
      "rewards/tag_count_reward/std": 0.3145124241709709,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 988.0,
      "completions/max_terminated_length": 960.0,
      "completions/mean_length": 731.109375,
      "completions/mean_terminated_length": 701.204345703125,
      "completions/min_length": 423.0,
      "completions/min_terminated_length": 423.0,
      "epoch": 0.002,
      "grad_norm": 0.7105857729911804,
      "kl": 0.00043010711669921875,
      "learning_rate": 3.3333333333333335e-07,
      "loss": -0.0603,
      "num_tokens": 305460.0,
      "reward": 0.17559084296226501,
      "reward_std": 0.3042265921831131,
      "rewards/format_reward/mean": 0.015625,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.031580422073602676,
      "rewards/penalized_accuracy_reward/std": 0.1263216882944107,
      "rewards/reasoning_steps_reward/mean": 0.2708333432674408,
      "rewards/reasoning_steps_reward/std": 0.3607480823993683,
      "rewards/tag_count_reward/mean": 0.0234375,
      "rewards/tag_count_reward/std": 0.06404344737529755,
      "step": 6
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1018.25,
      "completions/max_terminated_length": 893.5,
      "completions/mean_length": 684.03125,
      "completions/mean_terminated_length": 628.0986251831055,
      "completions/min_length": 351.25,
      "completions/min_terminated_length": 351.25,
      "epoch": 0.0023333333333333335,
      "grad_norm": 0.7914196848869324,
      "kl": 0.00037479400634765625,
      "learning_rate": 4.0000000000000003e-07,
      "loss": -0.0261,
      "num_tokens": 360710.0,
      "reward": 0.3031249940395355,
      "reward_std": 0.24116554856300354,
      "rewards/format_reward/mean": 0.0625,
      "rewards/format_reward/std": 0.21039126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.5312500074505806,
      "rewards/reasoning_steps_reward/std": 0.4420531764626503,
      "rewards/tag_count_reward/mean": 0.125,
      "rewards/tag_count_reward/std": 0.26729242503643036,
      "step": 7
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 874.75,
      "completions/max_terminated_length": 825.0,
      "completions/mean_length": 621.640625,
      "completions/mean_terminated_length": 583.5256652832031,
      "completions/min_length": 289.5,
      "completions/min_terminated_length": 289.5,
      "epoch": 0.0026666666666666666,
      "grad_norm": 0.6808961629867554,
      "kl": 0.000308990478515625,
      "learning_rate": 4.666666666666667e-07,
      "loss": 0.0044,
      "num_tokens": 408223.0,
      "reward": 0.16835937835276127,
      "reward_std": 0.1593070924282074,
      "rewards/format_reward/mean": 0.046875,
      "rewards/format_reward/std": 0.10077822208404541,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.2812500111758709,
      "rewards/reasoning_steps_reward/std": 0.2916427403688431,
      "rewards/tag_count_reward/mean": 0.08984375,
      "rewards/tag_count_reward/std": 0.21786238253116608,
      "step": 8
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 926.25,
      "completions/max_terminated_length": 918.0,
      "completions/mean_length": 631.046875,
      "completions/mean_terminated_length": 597.6302185058594,
      "completions/min_length": 348.5,
      "completions/min_terminated_length": 348.5,
      "epoch": 0.003,
      "grad_norm": 0.7980362772941589,
      "kl": 0.0003371238708496094,
      "learning_rate": 5.333333333333335e-07,
      "loss": -0.0431,
      "num_tokens": 457730.0,
      "reward": 0.37369417771697044,
      "reward_std": 0.43127137050032616,
      "rewards/format_reward/mean": 0.03125,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.13189728558063507,
      "rewards/penalized_accuracy_reward/std": 0.33436600118875504,
      "rewards/reasoning_steps_reward/mean": 0.4375000074505806,
      "rewards/reasoning_steps_reward/std": 0.3658451661467552,
      "rewards/tag_count_reward/mean": 0.10546875,
      "rewards/tag_count_reward/std": 0.26681406423449516,
      "step": 9
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 953.25,
      "completions/max_terminated_length": 921.25,
      "completions/mean_length": 744.0625,
      "completions/mean_terminated_length": 680.9791870117188,
      "completions/min_length": 445.0,
      "completions/min_terminated_length": 445.0,
      "epoch": 0.0033333333333333335,
      "grad_norm": 0.5989229083061218,
      "kl": 0.0002636909484863281,
      "learning_rate": 6.000000000000001e-07,
      "loss": 0.0488,
      "num_tokens": 515910.0,
      "reward": 0.3035855982452631,
      "reward_std": 0.37139888666570187,
      "rewards/format_reward/mean": 0.015625,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.1660856008529663,
      "rewards/penalized_accuracy_reward/std": 0.2546969950199127,
      "rewards/reasoning_steps_reward/mean": 0.2499999962747097,
      "rewards/reasoning_steps_reward/std": 0.22616704553365707,
      "rewards/tag_count_reward/mean": 0.0625,
      "rewards/tag_count_reward/std": 0.1956711709499359,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 893.25,
      "completions/max_terminated_length": 866.25,
      "completions/mean_length": 608.453125,
      "completions/mean_terminated_length": 562.5375061035156,
      "completions/min_length": 249.75,
      "completions/min_terminated_length": 249.75,
      "epoch": 0.0036666666666666666,
      "grad_norm": 0.8321331739425659,
      "kl": 0.0003871917724609375,
      "learning_rate": 6.666666666666667e-07,
      "loss": -0.0635,
      "num_tokens": 566691.0,
      "reward": 0.3195977807044983,
      "reward_std": 0.2748823333531618,
      "rewards/format_reward/mean": 0.0625,
      "rewards/format_reward/std": 0.17078252136707306,
      "rewards/penalized_accuracy_reward/mean": 0.027670694515109062,
      "rewards/penalized_accuracy_reward/std": 0.11068278551101685,
      "rewards/reasoning_steps_reward/mean": 0.4947916716337204,
      "rewards/reasoning_steps_reward/std": 0.3461822122335434,
      "rewards/tag_count_reward/mean": 0.1953125,
      "rewards/tag_count_reward/std": 0.3081725612282753,
      "step": 11
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 947.25,
      "completions/max_terminated_length": 913.5,
      "completions/mean_length": 700.21875,
      "completions/mean_terminated_length": 651.2534484863281,
      "completions/min_length": 370.75,
      "completions/min_terminated_length": 370.75,
      "epoch": 0.004,
      "grad_norm": 0.64812833070755,
      "kl": 0.0003380775451660156,
      "learning_rate": 7.333333333333334e-07,
      "loss": -0.0634,
      "num_tokens": 620673.0,
      "reward": 0.28919271379709244,
      "reward_std": 0.19906166940927505,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.557291679084301,
      "rewards/reasoning_steps_reward/std": 0.3937826156616211,
      "rewards/tag_count_reward/mean": 0.10546875,
      "rewards/tag_count_reward/std": 0.2427176907658577,
      "step": 12
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 932.5,
      "completions/max_terminated_length": 899.25,
      "completions/mean_length": 684.0625,
      "completions/mean_terminated_length": 641.3697967529297,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 0.004333333333333333,
      "grad_norm": 0.7295641899108887,
      "kl": 0.0003490447998046875,
      "learning_rate": 8.000000000000001e-07,
      "loss": -0.0096,
      "num_tokens": 673253.0,
      "reward": 0.19283854216337204,
      "reward_std": 0.23215484991669655,
      "rewards/format_reward/mean": 0.046875,
      "rewards/format_reward/std": 0.14789126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.3177083432674408,
      "rewards/reasoning_steps_reward/std": 0.38450102508068085,
      "rewards/tag_count_reward/mean": 0.15234375,
      "rewards/tag_count_reward/std": 0.3064930960536003,
      "step": 13
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 988.0,
      "completions/max_terminated_length": 930.75,
      "completions/mean_length": 740.4375,
      "completions/mean_terminated_length": 700.2864837646484,
      "completions/min_length": 381.25,
      "completions/min_terminated_length": 381.25,
      "epoch": 0.004666666666666667,
      "grad_norm": 0.6185789108276367,
      "kl": 0.00036716461181640625,
      "learning_rate": 8.666666666666668e-07,
      "loss": 0.0149,
      "num_tokens": 735169.0,
      "reward": 0.22842025011777878,
      "reward_std": 0.3898373916745186,
      "rewards/format_reward/mean": 0.015625,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.10159732773900032,
      "rewards/penalized_accuracy_reward/std": 0.3202204257249832,
      "rewards/reasoning_steps_reward/mean": 0.22395833767950535,
      "rewards/reasoning_steps_reward/std": 0.2862655222415924,
      "rewards/tag_count_reward/mean": 0.0859375,
      "rewards/tag_count_reward/std": 0.21858105063438416,
      "step": 14
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 956.75,
      "completions/mean_length": 719.765625,
      "completions/mean_terminated_length": 653.96728515625,
      "completions/min_length": 277.75,
      "completions/min_terminated_length": 277.75,
      "epoch": 0.005,
      "grad_norm": 0.672243595123291,
      "kl": 0.000499725341796875,
      "learning_rate": 9.333333333333334e-07,
      "loss": -0.0536,
      "num_tokens": 791810.0,
      "reward": 0.3214690247550607,
      "reward_std": 0.4147674571722746,
      "rewards/format_reward/mean": 0.078125,
      "rewards/format_reward/std": 0.18616948276758194,
      "rewards/penalized_accuracy_reward/mean": 0.14217214286327362,
      "rewards/penalized_accuracy_reward/std": 0.2546563148498535,
      "rewards/reasoning_steps_reward/mean": 0.250000006519258,
      "rewards/reasoning_steps_reward/std": 0.3242802955210209,
      "rewards/tag_count_reward/mean": 0.23046875,
      "rewards/tag_count_reward/std": 0.3277582451701164,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 735.25,
      "completions/max_terminated_length": 727.0,
      "completions/mean_length": 570.8125,
      "completions/mean_terminated_length": 559.7447967529297,
      "completions/min_length": 406.75,
      "completions/min_terminated_length": 406.75,
      "epoch": 0.005333333333333333,
      "grad_norm": 0.8322139382362366,
      "kl": 0.00043010711669921875,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.0246,
      "num_tokens": 839238.0,
      "reward": 0.6507812291383743,
      "reward_std": 0.36999223567545414,
      "rewards/format_reward/mean": 0.0625,
      "rewards/format_reward/std": 0.21039126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.30078125,
      "rewards/penalized_accuracy_reward/std": 0.20943717658519745,
      "rewards/reasoning_steps_reward/mean": 0.6250000149011612,
      "rewards/reasoning_steps_reward/std": 0.3061271086335182,
      "rewards/tag_count_reward/mean": 0.125,
      "rewards/tag_count_reward/std": 0.28826820850372314,
      "step": 16
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 894.75,
      "completions/max_terminated_length": 852.25,
      "completions/mean_length": 616.734375,
      "completions/mean_terminated_length": 578.4989624023438,
      "completions/min_length": 225.5,
      "completions/min_terminated_length": 225.5,
      "epoch": 0.005666666666666667,
      "grad_norm": 0.6916234493255615,
      "kl": 0.0005021095275878906,
      "learning_rate": 1.066666666666667e-06,
      "loss": -0.0533,
      "num_tokens": 887797.0,
      "reward": 0.20742188021540642,
      "reward_std": 0.19378306157886982,
      "rewards/format_reward/mean": 0.03125,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.3437500037252903,
      "rewards/reasoning_steps_reward/std": 0.34409795701503754,
      "rewards/tag_count_reward/mean": 0.23046875,
      "rewards/tag_count_reward/std": 0.4111901819705963,
      "step": 17
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 861.25,
      "completions/max_terminated_length": 796.0,
      "completions/mean_length": 591.015625,
      "completions/mean_terminated_length": 568.8747711181641,
      "completions/min_length": 287.75,
      "completions/min_terminated_length": 287.75,
      "epoch": 0.006,
      "grad_norm": 0.9087463021278381,
      "kl": 0.0009756088256835938,
      "learning_rate": 1.1333333333333334e-06,
      "loss": 0.0337,
      "num_tokens": 934310.0,
      "reward": 0.19101562723517418,
      "reward_std": 0.2238428182899952,
      "rewards/format_reward/mean": 0.15625,
      "rewards/format_reward/std": 0.3604728877544403,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.1718750037252903,
      "rewards/reasoning_steps_reward/std": 0.3026356063783169,
      "rewards/tag_count_reward/mean": 0.42578125,
      "rewards/tag_count_reward/std": 0.4487849324941635,
      "step": 18
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 883.25,
      "completions/max_terminated_length": 788.25,
      "completions/mean_length": 555.65625,
      "completions/mean_terminated_length": 503.95638275146484,
      "completions/min_length": 198.0,
      "completions/min_terminated_length": 198.0,
      "epoch": 0.006333333333333333,
      "grad_norm": 0.945106029510498,
      "kl": 0.00121307373046875,
      "learning_rate": 1.2000000000000002e-06,
      "loss": -0.0063,
      "num_tokens": 979584.0,
      "reward": 0.3121093846857548,
      "reward_std": 0.27271439135074615,
      "rewards/format_reward/mean": 0.234375,
      "rewards/format_reward/std": 0.4255262687802315,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.3593750074505806,
      "rewards/reasoning_steps_reward/std": 0.3613637499511242,
      "rewards/tag_count_reward/mean": 0.38671875,
      "rewards/tag_count_reward/std": 0.4463852792978287,
      "step": 19
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 949.0,
      "completions/mean_length": 600.328125,
      "completions/mean_terminated_length": 566.3238296508789,
      "completions/min_length": 263.75,
      "completions/min_terminated_length": 263.75,
      "epoch": 0.006666666666666667,
      "grad_norm": 0.7967208027839661,
      "kl": 0.0017490386962890625,
      "learning_rate": 1.2666666666666669e-06,
      "loss": 0.0029,
      "num_tokens": 1028293.0,
      "reward": 0.33867188170552254,
      "reward_std": 0.2778767757117748,
      "rewards/format_reward/mean": 0.3125,
      "rewards/format_reward/std": 0.4581565484404564,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.3125000074505806,
      "rewards/reasoning_steps_reward/std": 0.36415334790945053,
      "rewards/tag_count_reward/mean": 0.57421875,
      "rewards/tag_count_reward/std": 0.42155271768569946,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 898.75,
      "completions/max_terminated_length": 840.25,
      "completions/mean_length": 640.125,
      "completions/mean_terminated_length": 591.8221282958984,
      "completions/min_length": 295.5,
      "completions/min_terminated_length": 295.5,
      "epoch": 0.007,
      "grad_norm": 0.7993925213813782,
      "kl": 0.0019083023071289062,
      "learning_rate": 1.3333333333333334e-06,
      "loss": -0.0005,
      "num_tokens": 1078381.0,
      "reward": 0.3596354275941849,
      "reward_std": 0.38519641384482384,
      "rewards/format_reward/mean": 0.171875,
      "rewards/format_reward/std": 0.37149807065725327,
      "rewards/penalized_accuracy_reward/mean": 0.08203125,
      "rewards/penalized_accuracy_reward/std": 0.17636188864707947,
      "rewards/reasoning_steps_reward/mean": 0.3333333469927311,
      "rewards/reasoning_steps_reward/std": 0.3621671050786972,
      "rewards/tag_count_reward/mean": 0.421875,
      "rewards/tag_count_reward/std": 0.422298327088356,
      "step": 21
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.265625,
      "completions/max_length": 900.75,
      "completions/max_terminated_length": 824.5,
      "completions/mean_length": 662.265625,
      "completions/mean_terminated_length": 595.5156402587891,
      "completions/min_length": 389.25,
      "completions/min_terminated_length": 389.25,
      "epoch": 0.007333333333333333,
      "grad_norm": 0.6876639127731323,
      "kl": 0.0032024383544921875,
      "learning_rate": 1.4000000000000001e-06,
      "loss": 0.0381,
      "num_tokens": 1132766.0,
      "reward": 0.7374160960316658,
      "reward_std": 0.6433865800499916,
      "rewards/format_reward/mean": 0.40625,
      "rewards/format_reward/std": 0.44495995342731476,
      "rewards/penalized_accuracy_reward/mean": 0.38207758590579033,
      "rewards/penalized_accuracy_reward/std": 0.5425488203763962,
      "rewards/reasoning_steps_reward/mean": 0.2552083395421505,
      "rewards/reasoning_steps_reward/std": 0.3171140179038048,
      "rewards/tag_count_reward/mean": 0.65234375,
      "rewards/tag_count_reward/std": 0.3613503724336624,
      "step": 22
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 813.75,
      "completions/max_terminated_length": 777.0,
      "completions/mean_length": 478.484375,
      "completions/mean_terminated_length": 471.4437561035156,
      "completions/min_length": 234.0,
      "completions/min_terminated_length": 234.0,
      "epoch": 0.007666666666666666,
      "grad_norm": 1.127966284751892,
      "kl": 0.005817413330078125,
      "learning_rate": 1.4666666666666669e-06,
      "loss": -0.0894,
      "num_tokens": 1173965.0,
      "reward": 0.611811488866806,
      "reward_std": 0.4041200578212738,
      "rewards/format_reward/mean": 0.546875,
      "rewards/format_reward/std": 0.48989029973745346,
      "rewards/penalized_accuracy_reward/mean": 0.07938960939645767,
      "rewards/penalized_accuracy_reward/std": 0.17101971805095673,
      "rewards/reasoning_steps_reward/mean": 0.4531250176951289,
      "rewards/reasoning_steps_reward/std": 0.33705293014645576,
      "rewards/tag_count_reward/mean": 0.87109375,
      "rewards/tag_count_reward/std": 0.2502099722623825,
      "step": 23
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 988.25,
      "completions/max_terminated_length": 892.0,
      "completions/mean_length": 654.796875,
      "completions/mean_terminated_length": 599.6551284790039,
      "completions/min_length": 289.75,
      "completions/min_terminated_length": 289.75,
      "epoch": 0.008,
      "grad_norm": 0.8260610699653625,
      "kl": 0.004428863525390625,
      "learning_rate": 1.5333333333333334e-06,
      "loss": -0.0424,
      "num_tokens": 1228112.0,
      "reward": 0.48359375447034836,
      "reward_std": 0.25352058187127113,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.48989029973745346,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.4687500149011612,
      "rewards/reasoning_steps_reward/std": 0.36968404054641724,
      "rewards/tag_count_reward/mean": 0.8046875,
      "rewards/tag_count_reward/std": 0.30307503417134285,
      "step": 24
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 969.75,
      "completions/max_terminated_length": 879.75,
      "completions/mean_length": 636.75,
      "completions/mean_terminated_length": 613.5290374755859,
      "completions/min_length": 303.25,
      "completions/min_terminated_length": 303.25,
      "epoch": 0.008333333333333333,
      "grad_norm": 0.7867465019226074,
      "kl": 0.00386810302734375,
      "learning_rate": 1.6000000000000001e-06,
      "loss": -0.0077,
      "num_tokens": 1280608.0,
      "reward": 0.6752409785985947,
      "reward_std": 0.5305026173591614,
      "rewards/format_reward/mean": 0.703125,
      "rewards/format_reward/std": 0.4503342807292938,
      "rewards/penalized_accuracy_reward/mean": 0.1337045282125473,
      "rewards/penalized_accuracy_reward/std": 0.36535558104515076,
      "rewards/reasoning_steps_reward/mean": 0.3385416753590107,
      "rewards/reasoning_steps_reward/std": 0.3921737000346184,
      "rewards/tag_count_reward/mean": 0.91015625,
      "rewards/tag_count_reward/std": 0.17918559536337852,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.203125,
      "completions/max_length": 956.25,
      "completions/max_terminated_length": 876.25,
      "completions/mean_length": 692.78125,
      "completions/mean_terminated_length": 641.17041015625,
      "completions/min_length": 356.75,
      "completions/min_terminated_length": 356.75,
      "epoch": 0.008666666666666666,
      "grad_norm": 1.0709264278411865,
      "kl": 0.020282745361328125,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 0.0592,
      "num_tokens": 1335762.0,
      "reward": 0.49266771972179413,
      "reward_std": 0.3373037725687027,
      "rewards/format_reward/mean": 0.578125,
      "rewards/format_reward/std": 0.4612434431910515,
      "rewards/penalized_accuracy_reward/mean": 0.031599994748830795,
      "rewards/penalized_accuracy_reward/std": 0.12639997899532318,
      "rewards/reasoning_steps_reward/mean": 0.3072916828095913,
      "rewards/reasoning_steps_reward/std": 0.3412012457847595,
      "rewards/tag_count_reward/mean": 0.76171875,
      "rewards/tag_count_reward/std": 0.2921975739300251,
      "step": 26
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 862.5,
      "completions/max_terminated_length": 815.0,
      "completions/mean_length": 499.21875,
      "completions/mean_terminated_length": 490.92188262939453,
      "completions/min_length": 203.25,
      "completions/min_terminated_length": 203.25,
      "epoch": 0.009,
      "grad_norm": 0.7725469470024109,
      "kl": 0.00894927978515625,
      "learning_rate": 1.7333333333333336e-06,
      "loss": -0.0116,
      "num_tokens": 1378272.0,
      "reward": 0.6114357858896255,
      "reward_std": 0.35752149671316147,
      "rewards/format_reward/mean": 0.828125,
      "rewards/format_reward/std": 0.31687305867671967,
      "rewards/penalized_accuracy_reward/mean": 0.06482118368148804,
      "rewards/penalized_accuracy_reward/std": 0.17760424315929413,
      "rewards/reasoning_steps_reward/mean": 0.2447916753590107,
      "rewards/reasoning_steps_reward/std": 0.3417239859700203,
      "rewards/tag_count_reward/mean": 0.9296875,
      "rewards/tag_count_reward/std": 0.16852997615933418,
      "step": 27
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 907.75,
      "completions/max_terminated_length": 890.75,
      "completions/mean_length": 659.234375,
      "completions/mean_terminated_length": 618.823974609375,
      "completions/min_length": 172.0,
      "completions/min_terminated_length": 172.0,
      "epoch": 0.009333333333333334,
      "grad_norm": 0.6757703423500061,
      "kl": 0.00604248046875,
      "learning_rate": 1.8000000000000001e-06,
      "loss": -0.0214,
      "num_tokens": 1430415.0,
      "reward": 0.569692924618721,
      "reward_std": 0.35038041695952415,
      "rewards/format_reward/mean": 0.6875,
      "rewards/format_reward/std": 0.46566852182149887,
      "rewards/penalized_accuracy_reward/mean": 0.055760642513632774,
      "rewards/penalized_accuracy_reward/std": 0.2230425775051117,
      "rewards/reasoning_steps_reward/mean": 0.3177083395421505,
      "rewards/reasoning_steps_reward/std": 0.3475276306271553,
      "rewards/tag_count_reward/mean": 0.80078125,
      "rewards/tag_count_reward/std": 0.2796638309955597,
      "step": 28
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 894.5,
      "completions/max_terminated_length": 778.5,
      "completions/mean_length": 568.203125,
      "completions/mean_terminated_length": 526.1045837402344,
      "completions/min_length": 190.75,
      "completions/min_terminated_length": 190.75,
      "epoch": 0.009666666666666667,
      "grad_norm": 0.9557077884674072,
      "kl": 0.00942230224609375,
      "learning_rate": 1.8666666666666669e-06,
      "loss": 0.0206,
      "num_tokens": 1478412.0,
      "reward": 0.6090030297636986,
      "reward_std": 0.34788935631513596,
      "rewards/format_reward/mean": 0.75,
      "rewards/format_reward/std": 0.4383598491549492,
      "rewards/penalized_accuracy_reward/mean": 0.0332217775285244,
      "rewards/penalized_accuracy_reward/std": 0.1328871250152588,
      "rewards/reasoning_steps_reward/mean": 0.4062500037252903,
      "rewards/reasoning_steps_reward/std": 0.37067270278930664,
      "rewards/tag_count_reward/mean": 0.7265625,
      "rewards/tag_count_reward/std": 0.372597873210907,
      "step": 29
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 986.75,
      "completions/max_terminated_length": 968.25,
      "completions/mean_length": 744.6875,
      "completions/mean_terminated_length": 701.7882080078125,
      "completions/min_length": 314.5,
      "completions/min_terminated_length": 314.5,
      "epoch": 0.01,
      "grad_norm": 0.6671625971794128,
      "kl": 0.0051116943359375,
      "learning_rate": 1.9333333333333336e-06,
      "loss": -0.0188,
      "num_tokens": 1535592.0,
      "reward": 0.6108183711767197,
      "reward_std": 0.3935040086507797,
      "rewards/format_reward/mean": 0.71875,
      "rewards/format_reward/std": 0.3811737895011902,
      "rewards/penalized_accuracy_reward/mean": 0.07006317377090454,
      "rewards/penalized_accuracy_reward/std": 0.1914493590593338,
      "rewards/reasoning_steps_reward/mean": 0.3385416828095913,
      "rewards/reasoning_steps_reward/std": 0.382925845682621,
      "rewards/tag_count_reward/mean": 0.83984375,
      "rewards/tag_count_reward/std": 0.23307598009705544,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.25,
      "completions/mean_length": 760.546875,
      "completions/mean_terminated_length": 627.3074493408203,
      "completions/min_length": 243.25,
      "completions/min_terminated_length": 243.25,
      "epoch": 0.010333333333333333,
      "grad_norm": 0.6423367857933044,
      "kl": 0.006683349609375,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.0674,
      "num_tokens": 1597051.0,
      "reward": 0.5471354126930237,
      "reward_std": 0.2835099846124649,
      "rewards/format_reward/mean": 0.546875,
      "rewards/format_reward/std": 0.5049516260623932,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.5208333358168602,
      "rewards/reasoning_steps_reward/std": 0.39296089485287666,
      "rewards/tag_count_reward/mean": 0.6796875,
      "rewards/tag_count_reward/std": 0.3426893353462219,
      "step": 31
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 1021.75,
      "completions/max_terminated_length": 937.0,
      "completions/mean_length": 564.15625,
      "completions/mean_terminated_length": 526.6151962280273,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "epoch": 0.010666666666666666,
      "grad_norm": 0.8222790956497192,
      "kl": 0.0097503662109375,
      "learning_rate": 2.0666666666666666e-06,
      "loss": -0.002,
      "num_tokens": 1645301.0,
      "reward": 0.7227423191070557,
      "reward_std": 0.41062621772289276,
      "rewards/format_reward/mean": 0.71875,
      "rewards/format_reward/std": 0.42308124154806137,
      "rewards/penalized_accuracy_reward/mean": 0.1365443915128708,
      "rewards/penalized_accuracy_reward/std": 0.24466487765312195,
      "rewards/reasoning_steps_reward/mean": 0.4427083386108279,
      "rewards/reasoning_steps_reward/std": 0.31103401258587837,
      "rewards/tag_count_reward/mean": 0.7734375,
      "rewards/tag_count_reward/std": 0.3284476548433304,
      "step": 32
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 926.0,
      "completions/max_terminated_length": 908.75,
      "completions/mean_length": 638.296875,
      "completions/mean_terminated_length": 592.3899688720703,
      "completions/min_length": 232.25,
      "completions/min_terminated_length": 232.25,
      "epoch": 0.011,
      "grad_norm": 0.8139158487319946,
      "kl": 0.00734710693359375,
      "learning_rate": 2.133333333333334e-06,
      "loss": -0.075,
      "num_tokens": 1695816.0,
      "reward": 0.5666666626930237,
      "reward_std": 0.2244843989610672,
      "rewards/format_reward/mean": 0.75,
      "rewards/format_reward/std": 0.4160471484065056,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.3645833358168602,
      "rewards/reasoning_steps_reward/std": 0.38267721980810165,
      "rewards/tag_count_reward/mean": 0.84375,
      "rewards/tag_count_reward/std": 0.2710249461233616,
      "step": 33
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 960.5,
      "completions/max_terminated_length": 829.75,
      "completions/mean_length": 529.84375,
      "completions/mean_terminated_length": 506.60001373291016,
      "completions/min_length": 135.5,
      "completions/min_terminated_length": 135.5,
      "epoch": 0.011333333333333334,
      "grad_norm": 0.817894697189331,
      "kl": 0.0108642578125,
      "learning_rate": 2.2e-06,
      "loss": -0.0127,
      "num_tokens": 1739294.0,
      "reward": 0.6891430914402008,
      "reward_std": 0.38321299850940704,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.34944770485162735,
      "rewards/penalized_accuracy_reward/mean": 0.07091392576694489,
      "rewards/penalized_accuracy_reward/std": 0.19396641850471497,
      "rewards/reasoning_steps_reward/mean": 0.3958333507180214,
      "rewards/reasoning_steps_reward/std": 0.39766644686460495,
      "rewards/tag_count_reward/mean": 0.765625,
      "rewards/tag_count_reward/std": 0.3521217107772827,
      "step": 34
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 886.5,
      "completions/max_terminated_length": 739.0,
      "completions/mean_length": 598.5,
      "completions/mean_terminated_length": 495.6458511352539,
      "completions/min_length": 225.5,
      "completions/min_terminated_length": 225.5,
      "epoch": 0.011666666666666667,
      "grad_norm": 0.843439519405365,
      "kl": 0.00963592529296875,
      "learning_rate": 2.266666666666667e-06,
      "loss": 0.0383,
      "num_tokens": 1787694.0,
      "reward": 0.6172518730163574,
      "reward_std": 0.42104343324899673,
      "rewards/format_reward/mean": 0.671875,
      "rewards/format_reward/std": 0.3956565484404564,
      "rewards/penalized_accuracy_reward/mean": 0.0732414573431015,
      "rewards/penalized_accuracy_reward/std": 0.20035846531391144,
      "rewards/reasoning_steps_reward/mean": 0.3958333432674408,
      "rewards/reasoning_steps_reward/std": 0.3839438855648041,
      "rewards/tag_count_reward/mean": 0.7734375,
      "rewards/tag_count_reward/std": 0.3030990958213806,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 931.0,
      "completions/max_terminated_length": 894.25,
      "completions/mean_length": 662.046875,
      "completions/mean_terminated_length": 623.0243072509766,
      "completions/min_length": 262.75,
      "completions/min_terminated_length": 262.75,
      "epoch": 0.012,
      "grad_norm": 0.7322201728820801,
      "kl": 0.0060577392578125,
      "learning_rate": 2.3333333333333336e-06,
      "loss": 0.0241,
      "num_tokens": 1839521.0,
      "reward": 0.6868381351232529,
      "reward_std": 0.4258001074194908,
      "rewards/format_reward/mean": 0.828125,
      "rewards/format_reward/std": 0.3384781554341316,
      "rewards/penalized_accuracy_reward/mean": 0.1107964739203453,
      "rewards/penalized_accuracy_reward/std": 0.23831558227539062,
      "rewards/reasoning_steps_reward/mean": 0.3177083535119891,
      "rewards/reasoning_steps_reward/std": 0.32344260439276695,
      "rewards/tag_count_reward/mean": 0.859375,
      "rewards/tag_count_reward/std": 0.26763106137514114,
      "step": 36
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 733.25,
      "completions/max_terminated_length": 698.0,
      "completions/mean_length": 423.625,
      "completions/mean_terminated_length": 402.15625,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "epoch": 0.012333333333333333,
      "grad_norm": 0.9728919863700867,
      "kl": 0.0107421875,
      "learning_rate": 2.4000000000000003e-06,
      "loss": -0.0521,
      "num_tokens": 1876105.0,
      "reward": 0.6976470351219177,
      "reward_std": 0.30700354278087616,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3221946656703949,
      "rewards/penalized_accuracy_reward/mean": 0.05988661199808121,
      "rewards/penalized_accuracy_reward/std": 0.1643446534872055,
      "rewards/reasoning_steps_reward/mean": 0.3958333507180214,
      "rewards/reasoning_steps_reward/std": 0.38543668389320374,
      "rewards/tag_count_reward/mean": 0.8984375,
      "rewards/tag_count_reward/std": 0.1949814110994339,
      "step": 37
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 957.0,
      "completions/max_terminated_length": 941.0,
      "completions/mean_length": 656.578125,
      "completions/mean_terminated_length": 629.3668365478516,
      "completions/min_length": 236.5,
      "completions/min_terminated_length": 236.5,
      "epoch": 0.012666666666666666,
      "grad_norm": 0.7981860637664795,
      "kl": 0.0076751708984375,
      "learning_rate": 2.466666666666667e-06,
      "loss": 0.0151,
      "num_tokens": 1928190.0,
      "reward": 0.5835937559604645,
      "reward_std": 0.21724450588226318,
      "rewards/format_reward/mean": 0.828125,
      "rewards/format_reward/std": 0.32438503205776215,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.3281250149011612,
      "rewards/reasoning_steps_reward/std": 0.400931254029274,
      "rewards/tag_count_reward/mean": 0.8828125,
      "rewards/tag_count_reward/std": 0.23317711055278778,
      "step": 38
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 990.5,
      "completions/max_terminated_length": 901.0,
      "completions/mean_length": 615.765625,
      "completions/mean_terminated_length": 587.3415298461914,
      "completions/min_length": 302.0,
      "completions/min_terminated_length": 302.0,
      "epoch": 0.013,
      "grad_norm": 0.8061979413032532,
      "kl": 0.0093231201171875,
      "learning_rate": 2.5333333333333338e-06,
      "loss": -0.004,
      "num_tokens": 1978623.0,
      "reward": 0.8341648280620575,
      "reward_std": 0.5322102606296539,
      "rewards/format_reward/mean": 0.828125,
      "rewards/format_reward/std": 0.37585217505693436,
      "rewards/penalized_accuracy_reward/mean": 0.18755022436380386,
      "rewards/penalized_accuracy_reward/std": 0.404110312461853,
      "rewards/reasoning_steps_reward/mean": 0.4635416567325592,
      "rewards/reasoning_steps_reward/std": 0.377722904086113,
      "rewards/tag_count_reward/mean": 0.8359375,
      "rewards/tag_count_reward/std": 0.28358178213238716,
      "step": 39
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 863.25,
      "completions/max_terminated_length": 793.0,
      "completions/mean_length": 629.4375,
      "completions/mean_terminated_length": 578.8204498291016,
      "completions/min_length": 294.75,
      "completions/min_terminated_length": 294.75,
      "epoch": 0.013333333333333334,
      "grad_norm": 0.7144984006881714,
      "kl": 0.0075531005859375,
      "learning_rate": 2.6e-06,
      "loss": 0.0337,
      "num_tokens": 2028779.0,
      "reward": 0.6130538880825043,
      "reward_std": 0.3260304667055607,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.2257782220840454,
      "rewards/penalized_accuracy_reward/mean": 0.03518928587436676,
      "rewards/penalized_accuracy_reward/std": 0.14075715839862823,
      "rewards/reasoning_steps_reward/mean": 0.2916666716337204,
      "rewards/reasoning_steps_reward/std": 0.3576437309384346,
      "rewards/tag_count_reward/mean": 0.8828125,
      "rewards/tag_count_reward/std": 0.1747661679983139,
      "step": 40
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 711.75,
      "completions/max_terminated_length": 683.0,
      "completions/mean_length": 428.9375,
      "completions/mean_terminated_length": 417.7008972167969,
      "completions/min_length": 186.25,
      "completions/min_terminated_length": 186.25,
      "epoch": 0.013666666666666667,
      "grad_norm": 1.0025185346603394,
      "kl": 0.014068603515625,
      "learning_rate": 2.666666666666667e-06,
      "loss": -0.1125,
      "num_tokens": 2064119.0,
      "reward": 0.7020634412765503,
      "reward_std": 0.31718097999691963,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.18217839300632477,
      "rewards/penalized_accuracy_reward/mean": 0.034355103969573975,
      "rewards/penalized_accuracy_reward/std": 0.1374204158782959,
      "rewards/reasoning_steps_reward/mean": 0.4166666716337204,
      "rewards/reasoning_steps_reward/std": 0.3688225708901882,
      "rewards/tag_count_reward/mean": 0.96875,
      "rewards/tag_count_reward/std": 0.06649631634354591,
      "step": 41
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 904.0,
      "completions/mean_length": 765.71875,
      "completions/mean_terminated_length": 634.1123352050781,
      "completions/min_length": 374.0,
      "completions/min_terminated_length": 374.0,
      "epoch": 0.014,
      "grad_norm": 0.6406546831130981,
      "kl": 0.00603485107421875,
      "learning_rate": 2.7333333333333336e-06,
      "loss": 0.1538,
      "num_tokens": 2125669.0,
      "reward": 0.6540641784667969,
      "reward_std": 0.3556971549987793,
      "rewards/format_reward/mean": 0.703125,
      "rewards/format_reward/std": 0.45028156042099,
      "rewards/penalized_accuracy_reward/mean": 0.03492354974150658,
      "rewards/penalized_accuracy_reward/std": 0.1396941989660263,
      "rewards/reasoning_steps_reward/mean": 0.5312500149011612,
      "rewards/reasoning_steps_reward/std": 0.3982694745063782,
      "rewards/tag_count_reward/mean": 0.72265625,
      "rewards/tag_count_reward/std": 0.33274202048778534,
      "step": 42
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 1011.75,
      "completions/max_terminated_length": 994.25,
      "completions/mean_length": 819.296875,
      "completions/mean_terminated_length": 770.6994323730469,
      "completions/min_length": 529.75,
      "completions/min_terminated_length": 529.75,
      "epoch": 0.014333333333333333,
      "grad_norm": 0.5630760192871094,
      "kl": 0.0052032470703125,
      "learning_rate": 2.8000000000000003e-06,
      "loss": 0.0635,
      "num_tokens": 2187736.0,
      "reward": 0.662239596247673,
      "reward_std": 0.25620727613568306,
      "rewards/format_reward/mean": 0.78125,
      "rewards/format_reward/std": 0.4176512807607651,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.526041679084301,
      "rewards/reasoning_steps_reward/std": 0.3573034182190895,
      "rewards/tag_count_reward/mean": 0.8671875,
      "rewards/tag_count_reward/std": 0.26535360887646675,
      "step": 43
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.421875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 995.25,
      "completions/mean_length": 883.703125,
      "completions/mean_terminated_length": 805.6934661865234,
      "completions/min_length": 515.5,
      "completions/min_terminated_length": 515.5,
      "epoch": 0.014666666666666666,
      "grad_norm": 0.48884493112564087,
      "kl": 0.005573272705078125,
      "learning_rate": 2.866666666666667e-06,
      "loss": 0.0273,
      "num_tokens": 2253701.0,
      "reward": 1.0193318128585815,
      "reward_std": 0.5557461529970169,
      "rewards/format_reward/mean": 0.59375,
      "rewards/format_reward/std": 0.4000816270709038,
      "rewards/penalized_accuracy_reward/mean": 0.37349849939346313,
      "rewards/penalized_accuracy_reward/std": 0.41204380989074707,
      "rewards/reasoning_steps_reward/mean": 0.6822916716337204,
      "rewards/reasoning_steps_reward/std": 0.3374997489154339,
      "rewards/tag_count_reward/mean": 0.671875,
      "rewards/tag_count_reward/std": 0.2664684094488621,
      "step": 44
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 976.0,
      "completions/max_terminated_length": 902.75,
      "completions/mean_length": 705.34375,
      "completions/mean_terminated_length": 608.9917602539062,
      "completions/min_length": 300.25,
      "completions/min_terminated_length": 300.25,
      "epoch": 0.015,
      "grad_norm": 0.6433221697807312,
      "kl": 0.00946044921875,
      "learning_rate": 2.9333333333333338e-06,
      "loss": 0.037,
      "num_tokens": 2315707.0,
      "reward": 0.7077403664588928,
      "reward_std": 0.40935203433036804,
      "rewards/format_reward/mean": 0.75,
      "rewards/format_reward/std": 0.41104350984096527,
      "rewards/penalized_accuracy_reward/mean": 0.06776641309261322,
      "rewards/penalized_accuracy_reward/std": 0.1868770569562912,
      "rewards/reasoning_steps_reward/mean": 0.5260416865348816,
      "rewards/reasoning_steps_reward/std": 0.3749267980456352,
      "rewards/tag_count_reward/mean": 0.76953125,
      "rewards/tag_count_reward/std": 0.34914373606443405,
      "step": 45
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 932.75,
      "completions/max_terminated_length": 898.0,
      "completions/mean_length": 656.46875,
      "completions/mean_terminated_length": 630.1919555664062,
      "completions/min_length": 415.5,
      "completions/min_terminated_length": 415.5,
      "epoch": 0.015333333333333332,
      "grad_norm": 0.7993950247764587,
      "kl": 0.0079345703125,
      "learning_rate": 3e-06,
      "loss": 0.0068,
      "num_tokens": 2368185.0,
      "reward": 0.868732750415802,
      "reward_std": 0.4208342842757702,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.18217839300632477,
      "rewards/penalized_accuracy_reward/mean": 0.14789938926696777,
      "rewards/penalized_accuracy_reward/std": 0.26476314663887024,
      "rewards/reasoning_steps_reward/mean": 0.5260416939854622,
      "rewards/reasoning_steps_reward/std": 0.3555455729365349,
      "rewards/tag_count_reward/mean": 0.953125,
      "rewards/tag_count_reward/std": 0.11129852384328842,
      "step": 46
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 831.25,
      "completions/max_terminated_length": 774.75,
      "completions/mean_length": 608.578125,
      "completions/mean_terminated_length": 544.359375,
      "completions/min_length": 243.25,
      "completions/min_terminated_length": 243.25,
      "epoch": 0.015666666666666666,
      "grad_norm": 1.5673341751098633,
      "kl": 0.05785369873046875,
      "learning_rate": 3.066666666666667e-06,
      "loss": 0.0451,
      "num_tokens": 2415102.0,
      "reward": 0.7765065282583237,
      "reward_std": 0.3577282950282097,
      "rewards/format_reward/mean": 0.796875,
      "rewards/format_reward/std": 0.3125,
      "rewards/penalized_accuracy_reward/mean": 0.06830339878797531,
      "rewards/penalized_accuracy_reward/std": 0.18774420022964478,
      "rewards/reasoning_steps_reward/mean": 0.6093750223517418,
      "rewards/reasoning_steps_reward/std": 0.36846283823251724,
      "rewards/tag_count_reward/mean": 0.84765625,
      "rewards/tag_count_reward/std": 0.2427973598241806,
      "step": 47
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.328125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 945.25,
      "completions/mean_length": 789.46875,
      "completions/mean_terminated_length": 684.9875183105469,
      "completions/min_length": 350.75,
      "completions/min_terminated_length": 350.75,
      "epoch": 0.016,
      "grad_norm": 0.5259022116661072,
      "kl": 0.006622314453125,
      "learning_rate": 3.133333333333334e-06,
      "loss": 0.0595,
      "num_tokens": 2477964.0,
      "reward": 0.5730468779802322,
      "reward_std": 0.2987581789493561,
      "rewards/format_reward/mean": 0.5625,
      "rewards/format_reward/std": 0.4622559919953346,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.5625000074505806,
      "rewards/reasoning_steps_reward/std": 0.3903362527489662,
      "rewards/tag_count_reward/mean": 0.66796875,
      "rewards/tag_count_reward/std": 0.3739175945520401,
      "step": 48
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 915.75,
      "completions/max_terminated_length": 856.5,
      "completions/mean_length": 685.359375,
      "completions/mean_terminated_length": 627.7360687255859,
      "completions/min_length": 309.25,
      "completions/min_terminated_length": 309.25,
      "epoch": 0.01633333333333333,
      "grad_norm": 0.7897787690162659,
      "kl": 0.008880615234375,
      "learning_rate": 3.2000000000000003e-06,
      "loss": 0.032,
      "num_tokens": 2530819.0,
      "reward": 0.6785156428813934,
      "reward_std": 0.21095674112439156,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.2882782220840454,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.5000000111758709,
      "rewards/reasoning_steps_reward/std": 0.34909606724977493,
      "rewards/tag_count_reward/mean": 0.91015625,
      "rewards/tag_count_reward/std": 0.16805679351091385,
      "step": 49
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1004.5,
      "completions/max_terminated_length": 952.5,
      "completions/mean_length": 680.25,
      "completions/mean_terminated_length": 654.5104370117188,
      "completions/min_length": 417.75,
      "completions/min_terminated_length": 417.75,
      "epoch": 0.016666666666666666,
      "grad_norm": 0.7008921504020691,
      "kl": 0.00914764404296875,
      "learning_rate": 3.266666666666667e-06,
      "loss": -0.0159,
      "num_tokens": 2584419.0,
      "reward": 0.7334635555744171,
      "reward_std": 0.20390507578849792,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.3375816270709038,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.598958358168602,
      "rewards/reasoning_steps_reward/std": 0.33922333642840385,
      "rewards/tag_count_reward/mean": 0.90234375,
      "rewards/tag_count_reward/std": 0.27025456726551056,
      "step": 50
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 961.75,
      "completions/max_terminated_length": 930.0,
      "completions/mean_length": 763.015625,
      "completions/mean_terminated_length": 721.3432312011719,
      "completions/min_length": 441.5,
      "completions/min_terminated_length": 441.5,
      "epoch": 0.017,
      "grad_norm": 0.686704158782959,
      "kl": 0.00669097900390625,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.0438,
      "num_tokens": 2649380.0,
      "reward": 0.7923340648412704,
      "reward_std": 0.4311821572482586,
      "rewards/format_reward/mean": 0.75,
      "rewards/format_reward/std": 0.41419370472431183,
      "rewards/penalized_accuracy_reward/mean": 0.07644861936569214,
      "rewards/penalized_accuracy_reward/std": 0.2088974118232727,
      "rewards/reasoning_steps_reward/mean": 0.6614583507180214,
      "rewards/reasoning_steps_reward/std": 0.3583720251917839,
      "rewards/tag_count_reward/mean": 0.8515625,
      "rewards/tag_count_reward/std": 0.20158234424889088,
      "step": 51
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 943.75,
      "completions/max_terminated_length": 938.25,
      "completions/mean_length": 719.15625,
      "completions/mean_terminated_length": 695.46875,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 0.017333333333333333,
      "grad_norm": 0.6101579070091248,
      "kl": 0.01004791259765625,
      "learning_rate": 3.4000000000000005e-06,
      "loss": -0.0173,
      "num_tokens": 2702990.0,
      "reward": 0.7369791567325592,
      "reward_std": 0.26093084178864956,
      "rewards/format_reward/mean": 0.671875,
      "rewards/format_reward/std": 0.36967839300632477,
      "rewards/penalized_accuracy_reward/mean": 0.01796874962747097,
      "rewards/penalized_accuracy_reward/std": 0.07187499850988388,
      "rewards/reasoning_steps_reward/mean": 0.7395833507180214,
      "rewards/reasoning_steps_reward/std": 0.29277897626161575,
      "rewards/tag_count_reward/mean": 0.8046875,
      "rewards/tag_count_reward/std": 0.27829742431640625,
      "step": 52
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 842.5,
      "completions/max_terminated_length": 837.25,
      "completions/mean_length": 655.0,
      "completions/mean_terminated_length": 639.6964569091797,
      "completions/min_length": 398.5,
      "completions/min_terminated_length": 398.5,
      "epoch": 0.017666666666666667,
      "grad_norm": 0.7164862751960754,
      "kl": 0.011871337890625,
      "learning_rate": 3.4666666666666672e-06,
      "loss": -0.0006,
      "num_tokens": 2755854.0,
      "reward": 0.6813784092664719,
      "reward_std": 0.3475731834769249,
      "rewards/format_reward/mean": 0.703125,
      "rewards/format_reward/std": 0.42046456038951874,
      "rewards/penalized_accuracy_reward/mean": 0.024607568979263306,
      "rewards/penalized_accuracy_reward/std": 0.09843027591705322,
      "rewards/reasoning_steps_reward/mean": 0.6041666865348816,
      "rewards/reasoning_steps_reward/std": 0.40745963156223297,
      "rewards/tag_count_reward/mean": 0.734375,
      "rewards/tag_count_reward/std": 0.34482838958501816,
      "step": 53
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 981.25,
      "completions/max_terminated_length": 956.25,
      "completions/mean_length": 735.3125,
      "completions/mean_terminated_length": 696.0399017333984,
      "completions/min_length": 310.0,
      "completions/min_terminated_length": 310.0,
      "epoch": 0.018,
      "grad_norm": 0.7103943824768066,
      "kl": 0.010498046875,
      "learning_rate": 3.5333333333333335e-06,
      "loss": -0.0591,
      "num_tokens": 2812194.0,
      "reward": 0.80135178565979,
      "reward_std": 0.3634731322526932,
      "rewards/format_reward/mean": 0.71875,
      "rewards/format_reward/std": 0.375,
      "rewards/penalized_accuracy_reward/mean": 0.05096115358173847,
      "rewards/penalized_accuracy_reward/std": 0.20384462922811508,
      "rewards/reasoning_steps_reward/mean": 0.7656250149011612,
      "rewards/reasoning_steps_reward/std": 0.3409550115466118,
      "rewards/tag_count_reward/mean": 0.80078125,
      "rewards/tag_count_reward/std": 0.2768521849066019,
      "step": 54
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 1000.25,
      "completions/max_terminated_length": 956.75,
      "completions/mean_length": 624.0,
      "completions/mean_terminated_length": 603.9885711669922,
      "completions/min_length": 288.25,
      "completions/min_terminated_length": 288.25,
      "epoch": 0.018333333333333333,
      "grad_norm": 0.840887725353241,
      "kl": 0.01261138916015625,
      "learning_rate": 3.6000000000000003e-06,
      "loss": -0.0931,
      "num_tokens": 2866658.0,
      "reward": 0.8203446567058563,
      "reward_std": 0.3611001707613468,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.34944770485162735,
      "rewards/penalized_accuracy_reward/mean": 0.07008424401283264,
      "rewards/penalized_accuracy_reward/std": 0.19155985116958618,
      "rewards/reasoning_steps_reward/mean": 0.630208358168602,
      "rewards/reasoning_steps_reward/std": 0.3516792505979538,
      "rewards/tag_count_reward/mean": 0.9140625,
      "rewards/tag_count_reward/std": 0.22830459102988243,
      "step": 55
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 640.75,
      "completions/max_terminated_length": 640.75,
      "completions/mean_length": 446.15625,
      "completions/mean_terminated_length": 446.15625,
      "completions/min_length": 296.5,
      "completions/min_terminated_length": 296.5,
      "epoch": 0.018666666666666668,
      "grad_norm": 0.9962576031684875,
      "kl": 0.020416259765625,
      "learning_rate": 3.6666666666666666e-06,
      "loss": -0.0914,
      "num_tokens": 2906220.0,
      "reward": 0.7946614772081375,
      "reward_std": 0.21071942150592804,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.24866948276758194,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.6822916865348816,
      "rewards/reasoning_steps_reward/std": 0.3173614516854286,
      "rewards/tag_count_reward/mean": 0.91015625,
      "rewards/tag_count_reward/std": 0.2787376195192337,
      "step": 56
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 932.5,
      "completions/max_terminated_length": 837.75,
      "completions/mean_length": 594.28125,
      "completions/mean_terminated_length": 565.7655563354492,
      "completions/min_length": 256.0,
      "completions/min_terminated_length": 256.0,
      "epoch": 0.019,
      "grad_norm": 0.7202043533325195,
      "kl": 0.0177154541015625,
      "learning_rate": 3.7333333333333337e-06,
      "loss": -0.0621,
      "num_tokens": 2953486.0,
      "reward": 0.8713552355766296,
      "reward_std": 0.468075692653656,
      "rewards/format_reward/mean": 0.765625,
      "rewards/format_reward/std": 0.29930340498685837,
      "rewards/penalized_accuracy_reward/mean": 0.1528656743466854,
      "rewards/penalized_accuracy_reward/std": 0.3525933623313904,
      "rewards/reasoning_steps_reward/mean": 0.6510416865348816,
      "rewards/reasoning_steps_reward/std": 0.35786738246679306,
      "rewards/tag_count_reward/mean": 0.8671875,
      "rewards/tag_count_reward/std": 0.24707800149917603,
      "step": 57
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 826.25,
      "completions/max_terminated_length": 800.5,
      "completions/mean_length": 588.96875,
      "completions/mean_terminated_length": 574.8820190429688,
      "completions/min_length": 285.5,
      "completions/min_terminated_length": 285.5,
      "epoch": 0.019333333333333334,
      "grad_norm": 0.7140520215034485,
      "kl": 0.02069091796875,
      "learning_rate": 3.8000000000000005e-06,
      "loss": -0.0343,
      "num_tokens": 3001212.0,
      "reward": 0.9257438629865646,
      "reward_std": 0.4081820733845234,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.27156074345111847,
      "rewards/penalized_accuracy_reward/mean": 0.09058760292828083,
      "rewards/penalized_accuracy_reward/std": 0.2755988612771034,
      "rewards/reasoning_steps_reward/mean": 0.7656250298023224,
      "rewards/reasoning_steps_reward/std": 0.32280419021844864,
      "rewards/tag_count_reward/mean": 0.9609375,
      "rewards/tag_count_reward/std": 0.10584449954330921,
      "step": 58
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 904.75,
      "completions/max_terminated_length": 853.5,
      "completions/mean_length": 612.609375,
      "completions/mean_terminated_length": 584.3977813720703,
      "completions/min_length": 297.5,
      "completions/min_terminated_length": 297.5,
      "epoch": 0.019666666666666666,
      "grad_norm": 0.6635720133781433,
      "kl": 0.0224456787109375,
      "learning_rate": 3.866666666666667e-06,
      "loss": 0.0093,
      "num_tokens": 3048851.0,
      "reward": 0.8723735809326172,
      "reward_std": 0.27260667085647583,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.3265564441680908,
      "rewards/penalized_accuracy_reward/mean": 0.035785011947155,
      "rewards/penalized_accuracy_reward/std": 0.14314004778862,
      "rewards/reasoning_steps_reward/mean": 0.786458358168602,
      "rewards/reasoning_steps_reward/std": 0.2604687921702862,
      "rewards/tag_count_reward/mean": 0.93359375,
      "rewards/tag_count_reward/std": 0.15990673378109932,
      "step": 59
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 850.25,
      "completions/max_terminated_length": 823.5,
      "completions/mean_length": 564.234375,
      "completions/mean_terminated_length": 533.5127105712891,
      "completions/min_length": 266.5,
      "completions/min_terminated_length": 266.5,
      "epoch": 0.02,
      "grad_norm": 0.6934652328491211,
      "kl": 0.030548095703125,
      "learning_rate": 3.9333333333333335e-06,
      "loss": -0.0195,
      "num_tokens": 3093298.0,
      "reward": 1.116418480873108,
      "reward_std": 0.5154721215367317,
      "rewards/format_reward/mean": 0.8125,
      "rewards/format_reward/std": 0.36435678601264954,
      "rewards/penalized_accuracy_reward/mean": 0.2634236477315426,
      "rewards/penalized_accuracy_reward/std": 0.39374517649412155,
      "rewards/reasoning_steps_reward/mean": 0.8802083283662796,
      "rewards/reasoning_steps_reward/std": 0.25018948689103127,
      "rewards/tag_count_reward/mean": 0.87890625,
      "rewards/tag_count_reward/std": 0.22917302697896957,
      "step": 60
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 814.25,
      "completions/max_terminated_length": 791.25,
      "completions/mean_length": 554.90625,
      "completions/mean_terminated_length": 549.4541778564453,
      "completions/min_length": 225.5,
      "completions/min_terminated_length": 225.5,
      "epoch": 0.02033333333333333,
      "grad_norm": 0.8435772061347961,
      "kl": 0.02685546875,
      "learning_rate": 4.000000000000001e-06,
      "loss": -0.0585,
      "num_tokens": 3138060.0,
      "reward": 0.9362861067056656,
      "reward_std": 0.2588004246354103,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.1875,
      "rewards/penalized_accuracy_reward/mean": 0.029124625027179718,
      "rewards/penalized_accuracy_reward/std": 0.11649850755929947,
      "rewards/reasoning_steps_reward/mean": 0.8541667014360428,
      "rewards/reasoning_steps_reward/std": 0.26949557289481163,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 61
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 921.75,
      "completions/max_terminated_length": 904.75,
      "completions/mean_length": 613.109375,
      "completions/mean_terminated_length": 601.5496215820312,
      "completions/min_length": 356.25,
      "completions/min_terminated_length": 356.25,
      "epoch": 0.020666666666666667,
      "grad_norm": 0.7626720070838928,
      "kl": 0.025482177734375,
      "learning_rate": 4.066666666666667e-06,
      "loss": -0.0778,
      "num_tokens": 3188611.0,
      "reward": 0.9843322783708572,
      "reward_std": 0.3281868249177933,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.18616948276758194,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.9010416865348816,
      "rewards/reasoning_steps_reward/std": 0.2222483716905117,
      "rewards/tag_count_reward/mean": 0.953125,
      "rewards/tag_count_reward/std": 0.12410355359315872,
      "step": 62
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 897.25,
      "completions/max_terminated_length": 882.0,
      "completions/mean_length": 638.796875,
      "completions/mean_terminated_length": 612.8735809326172,
      "completions/min_length": 285.75,
      "completions/min_terminated_length": 285.75,
      "epoch": 0.021,
      "grad_norm": 0.7839279770851135,
      "kl": 0.038116455078125,
      "learning_rate": 4.133333333333333e-06,
      "loss": -0.0376,
      "num_tokens": 3237910.0,
      "reward": 0.8942708224058151,
      "reward_std": 0.1893441639840603,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.24467839300632477,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.8854166865348816,
      "rewards/reasoning_steps_reward/std": 0.24039705470204353,
      "rewards/tag_count_reward/mean": 0.953125,
      "rewards/tag_count_reward/std": 0.09034235030412674,
      "step": 63
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 797.25,
      "completions/max_terminated_length": 769.75,
      "completions/mean_length": 527.171875,
      "completions/mean_terminated_length": 521.1739654541016,
      "completions/min_length": 322.0,
      "completions/min_terminated_length": 322.0,
      "epoch": 0.021333333333333333,
      "grad_norm": 0.7821366786956787,
      "kl": 0.030670166015625,
      "learning_rate": 4.2000000000000004e-06,
      "loss": -0.0015,
      "num_tokens": 3280753.0,
      "reward": 1.0909536629915237,
      "reward_std": 0.4102521315217018,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.24866948276758194,
      "rewards/penalized_accuracy_reward/mean": 0.1860057171434164,
      "rewards/penalized_accuracy_reward/std": 0.32973112910985947,
      "rewards/reasoning_steps_reward/mean": 0.895833358168602,
      "rewards/reasoning_steps_reward/std": 0.1899307519197464,
      "rewards/tag_count_reward/mean": 0.9453125,
      "rewards/tag_count_reward/std": 0.15195956081151962,
      "step": 64
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 1001.0,
      "completions/max_terminated_length": 874.0,
      "completions/mean_length": 681.328125,
      "completions/mean_terminated_length": 620.4050750732422,
      "completions/min_length": 312.0,
      "completions/min_terminated_length": 312.0,
      "epoch": 0.021666666666666667,
      "grad_norm": 0.7356759309768677,
      "kl": 0.02301025390625,
      "learning_rate": 4.266666666666668e-06,
      "loss": 0.1044,
      "num_tokens": 3334966.0,
      "reward": 0.8970052152872086,
      "reward_std": 0.18142974004149437,
      "rewards/format_reward/mean": 0.828125,
      "rewards/format_reward/std": 0.3683478757739067,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9479167014360428,
      "rewards/reasoning_steps_reward/std": 0.1763468012213707,
      "rewards/tag_count_reward/mean": 0.91796875,
      "rewards/tag_count_reward/std": 0.1946777980774641,
      "step": 65
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 958.25,
      "completions/max_terminated_length": 885.5,
      "completions/mean_length": 617.75,
      "completions/mean_terminated_length": 591.3373718261719,
      "completions/min_length": 338.5,
      "completions/min_terminated_length": 338.5,
      "epoch": 0.022,
      "grad_norm": 0.7183022499084473,
      "kl": 0.02606201171875,
      "learning_rate": 4.333333333333334e-06,
      "loss": 0.1036,
      "num_tokens": 3383014.0,
      "reward": 1.533137023448944,
      "reward_std": 0.23449738323688507,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.2750816270709038,
      "rewards/penalized_accuracy_reward/mean": 0.6005849502980709,
      "rewards/penalized_accuracy_reward/std": 0.13191417790949345,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.05442607030272484,
      "rewards/tag_count_reward/mean": 0.9296875,
      "rewards/tag_count_reward/std": 0.17777499184012413,
      "step": 66
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 972.5,
      "completions/max_terminated_length": 890.5,
      "completions/mean_length": 744.75,
      "completions/mean_terminated_length": 720.5144195556641,
      "completions/min_length": 490.25,
      "completions/min_terminated_length": 490.25,
      "epoch": 0.022333333333333334,
      "grad_norm": 0.6722339987754822,
      "kl": 0.02935791015625,
      "learning_rate": 4.4e-06,
      "loss": 0.0201,
      "num_tokens": 3442262.0,
      "reward": 0.9450913518667221,
      "reward_std": 0.21717733424156904,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.24467839300632477,
      "rewards/penalized_accuracy_reward/mean": 0.02477882243692875,
      "rewards/penalized_accuracy_reward/std": 0.099115289747715,
      "rewards/reasoning_steps_reward/mean": 0.9375000149011612,
      "rewards/reasoning_steps_reward/std": 0.1712810881435871,
      "rewards/tag_count_reward/mean": 0.953125,
      "rewards/tag_count_reward/std": 0.11574538052082062,
      "step": 67
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 946.25,
      "completions/max_terminated_length": 927.0,
      "completions/mean_length": 760.0,
      "completions/mean_terminated_length": 719.6666870117188,
      "completions/min_length": 486.75,
      "completions/min_terminated_length": 486.75,
      "epoch": 0.02266666666666667,
      "grad_norm": 0.497885525226593,
      "kl": 0.029876708984375,
      "learning_rate": 4.4666666666666665e-06,
      "loss": 0.0258,
      "num_tokens": 3499686.0,
      "reward": 0.8750000149011612,
      "reward_std": 0.14313609153032303,
      "rewards/format_reward/mean": 0.75,
      "rewards/format_reward/std": 0.30717839300632477,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9687500149011612,
      "rewards/reasoning_steps_reward/std": 0.10621638596057892,
      "rewards/tag_count_reward/mean": 0.90625,
      "rewards/tag_count_reward/std": 0.13139523938298225,
      "step": 68
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 968.5,
      "completions/max_terminated_length": 953.0,
      "completions/mean_length": 754.375,
      "completions/mean_terminated_length": 725.8952026367188,
      "completions/min_length": 478.0,
      "completions/min_terminated_length": 478.0,
      "epoch": 0.023,
      "grad_norm": 0.6058640480041504,
      "kl": 0.026092529296875,
      "learning_rate": 4.533333333333334e-06,
      "loss": 0.0663,
      "num_tokens": 3561918.0,
      "reward": 0.933966264128685,
      "reward_std": 0.4210771322250366,
      "rewards/format_reward/mean": 0.8125,
      "rewards/format_reward/std": 0.3265564441680908,
      "rewards/penalized_accuracy_reward/mean": 0.07641417533159256,
      "rewards/penalized_accuracy_reward/std": 0.30565670132637024,
      "rewards/reasoning_steps_reward/mean": 0.8854166865348816,
      "rewards/reasoning_steps_reward/std": 0.22727786377072334,
      "rewards/tag_count_reward/mean": 0.8984375,
      "rewards/tag_count_reward/std": 0.19980989769101143,
      "step": 69
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 892.25,
      "completions/max_terminated_length": 783.5,
      "completions/mean_length": 589.015625,
      "completions/mean_terminated_length": 570.8385620117188,
      "completions/min_length": 378.5,
      "completions/min_terminated_length": 378.5,
      "epoch": 0.023333333333333334,
      "grad_norm": 0.7925571203231812,
      "kl": 0.038665771484375,
      "learning_rate": 4.600000000000001e-06,
      "loss": 0.0554,
      "num_tokens": 3609519.0,
      "reward": 0.9950062930583954,
      "reward_std": 0.18352606147527695,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.14789126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.02547505497932434,
      "rewards/penalized_accuracy_reward/std": 0.10190021991729736,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.0624999962747097,
      "rewards/tag_count_reward/mean": 0.9609375,
      "rewards/tag_count_reward/std": 0.1292813941836357,
      "step": 70
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1017.5,
      "completions/max_terminated_length": 995.5,
      "completions/mean_length": 649.046875,
      "completions/mean_terminated_length": 618.1365280151367,
      "completions/min_length": 380.5,
      "completions/min_terminated_length": 380.5,
      "epoch": 0.023666666666666666,
      "grad_norm": 0.7724472284317017,
      "kl": 0.03399658203125,
      "learning_rate": 4.666666666666667e-06,
      "loss": 0.108,
      "num_tokens": 3661650.0,
      "reward": 0.9970489591360092,
      "reward_std": 0.30074702948331833,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.3450859263539314,
      "rewards/penalized_accuracy_reward/mean": 0.06775209307670593,
      "rewards/penalized_accuracy_reward/std": 0.18513934314250946,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.0624999962747097,
      "rewards/tag_count_reward/mean": 0.93359375,
      "rewards/tag_count_reward/std": 0.1695580966770649,
      "step": 71
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 933.75,
      "completions/max_terminated_length": 917.0,
      "completions/mean_length": 641.21875,
      "completions/mean_terminated_length": 631.3729400634766,
      "completions/min_length": 379.75,
      "completions/min_terminated_length": 379.75,
      "epoch": 0.024,
      "grad_norm": 0.5506178736686707,
      "kl": 0.03662109375,
      "learning_rate": 4.7333333333333335e-06,
      "loss": -0.0149,
      "num_tokens": 3713456.0,
      "reward": 1.0036645531654358,
      "reward_std": 0.20802644453942776,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.14789126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.03712809830904007,
      "rewards/penalized_accuracy_reward/std": 0.14851240813732147,
      "rewards/reasoning_steps_reward/mean": 0.9791666865348816,
      "rewards/reasoning_steps_reward/std": 0.07013041526079178,
      "rewards/tag_count_reward/mean": 0.95703125,
      "rewards/tag_count_reward/std": 0.11938536167144775,
      "step": 72
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 698.5,
      "completions/max_terminated_length": 698.5,
      "completions/mean_length": 423.875,
      "completions/mean_terminated_length": 423.875,
      "completions/min_length": 269.25,
      "completions/min_terminated_length": 269.25,
      "epoch": 0.024333333333333332,
      "grad_norm": 0.859659731388092,
      "kl": 0.04425048828125,
      "learning_rate": 4.800000000000001e-06,
      "loss": 0.0026,
      "num_tokens": 3748776.0,
      "reward": 1.2820918262004852,
      "reward_std": 0.6469196081161499,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.3676386624574661,
      "rewards/penalized_accuracy_reward/std": 0.6582163870334625,
      "rewards/reasoning_steps_reward/mean": 0.859375,
      "rewards/reasoning_steps_reward/std": 0.2520834319293499,
      "rewards/tag_count_reward/mean": 0.97265625,
      "rewards/tag_count_reward/std": 0.07966844737529755,
      "step": 73
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 763.5,
      "completions/max_terminated_length": 763.5,
      "completions/mean_length": 542.515625,
      "completions/mean_terminated_length": 542.515625,
      "completions/min_length": 310.0,
      "completions/min_terminated_length": 310.0,
      "epoch": 0.024666666666666667,
      "grad_norm": 0.5363684892654419,
      "kl": 0.0386962890625,
      "learning_rate": 4.866666666666667e-06,
      "loss": -0.0286,
      "num_tokens": 3794937.0,
      "reward": 1.0115860998630524,
      "reward_std": 0.21859073173254728,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0546850711107254,
      "rewards/penalized_accuracy_reward/std": 0.15053680539131165,
      "rewards/reasoning_steps_reward/mean": 0.9270833432674408,
      "rewards/reasoning_steps_reward/std": 0.1448565348982811,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 74
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 778.5,
      "completions/max_terminated_length": 766.5,
      "completions/mean_length": 561.4375,
      "completions/mean_terminated_length": 557.7208404541016,
      "completions/min_length": 328.75,
      "completions/min_terminated_length": 328.75,
      "epoch": 0.025,
      "grad_norm": 0.8160569667816162,
      "kl": 0.041351318359375,
      "learning_rate": 4.933333333333334e-06,
      "loss": 0.021,
      "num_tokens": 3839589.0,
      "reward": 1.1652396470308304,
      "reward_std": 0.28621215745806694,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.25,
      "rewards/penalized_accuracy_reward/mean": 0.27513551712036133,
      "rewards/penalized_accuracy_reward/std": 0.16930751502513885,
      "rewards/reasoning_steps_reward/mean": 0.833333358168602,
      "rewards/reasoning_steps_reward/std": 0.19271302968263626,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.0625,
      "step": 75
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 697.0,
      "completions/max_terminated_length": 697.0,
      "completions/mean_length": 449.75,
      "completions/mean_terminated_length": 449.75,
      "completions/min_length": 257.5,
      "completions/min_terminated_length": 257.5,
      "epoch": 0.025333333333333333,
      "grad_norm": 0.8491156697273254,
      "kl": 0.05242919921875,
      "learning_rate": 5e-06,
      "loss": -0.0402,
      "num_tokens": 3879173.0,
      "reward": 0.9087462574243546,
      "reward_std": 0.22535051591694355,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.2050696536898613,
      "rewards/penalized_accuracy_reward/mean": 0.024371251463890076,
      "rewards/penalized_accuracy_reward/std": 0.0974850058555603,
      "rewards/reasoning_steps_reward/mean": 0.8750000149011612,
      "rewards/reasoning_steps_reward/std": 0.18622694537043571,
      "rewards/tag_count_reward/mean": 0.90625,
      "rewards/tag_count_reward/std": 0.21993406862020493,
      "step": 76
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 646.0,
      "completions/max_terminated_length": 646.0,
      "completions/mean_length": 466.328125,
      "completions/mean_terminated_length": 466.328125,
      "completions/min_length": 281.25,
      "completions/min_terminated_length": 281.25,
      "epoch": 0.025666666666666667,
      "grad_norm": 0.913472056388855,
      "kl": 0.046905517578125,
      "learning_rate": 5.0666666666666676e-06,
      "loss": 0.0071,
      "num_tokens": 3920298.0,
      "reward": 0.9114020764827728,
      "reward_std": 0.2558419294655323,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.3058478757739067,
      "rewards/penalized_accuracy_reward/mean": 0.028198951855301857,
      "rewards/penalized_accuracy_reward/std": 0.11279580742120743,
      "rewards/reasoning_steps_reward/mean": 0.9062500149011612,
      "rewards/reasoning_steps_reward/std": 0.17976614087820053,
      "rewards/tag_count_reward/mean": 0.92578125,
      "rewards/tag_count_reward/std": 0.1920349784195423,
      "step": 77
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 699.75,
      "completions/max_terminated_length": 552.0,
      "completions/mean_length": 378.46875,
      "completions/mean_terminated_length": 350.8145980834961,
      "completions/min_length": 186.75,
      "completions/min_terminated_length": 186.75,
      "epoch": 0.026,
      "grad_norm": 1.1382102966308594,
      "kl": 0.0594482421875,
      "learning_rate": 5.133333333333334e-06,
      "loss": 0.1165,
      "num_tokens": 3953512.0,
      "reward": 0.8847656399011612,
      "reward_std": 0.16677778400480747,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.29578252136707306,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.8593750298023224,
      "rewards/reasoning_steps_reward/std": 0.19023765996098518,
      "rewards/tag_count_reward/mean": 0.92578125,
      "rewards/tag_count_reward/std": 0.20477662421762943,
      "step": 78
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 744.75,
      "completions/max_terminated_length": 744.75,
      "completions/mean_length": 499.765625,
      "completions/mean_terminated_length": 499.765625,
      "completions/min_length": 295.75,
      "completions/min_terminated_length": 295.75,
      "epoch": 0.026333333333333334,
      "grad_norm": 0.8621724843978882,
      "kl": 0.0494384765625,
      "learning_rate": 5.2e-06,
      "loss": 0.0036,
      "num_tokens": 3996825.0,
      "reward": 0.9801009744405746,
      "reward_std": 0.2303646355867386,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.1875,
      "rewards/penalized_accuracy_reward/mean": 0.033877044916152954,
      "rewards/penalized_accuracy_reward/std": 0.135508194565773,
      "rewards/reasoning_steps_reward/mean": 0.9322916865348816,
      "rewards/reasoning_steps_reward/std": 0.1747187376022339,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 79
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 712.5,
      "completions/max_terminated_length": 661.5,
      "completions/mean_length": 467.015625,
      "completions/mean_terminated_length": 460.1291809082031,
      "completions/min_length": 278.75,
      "completions/min_terminated_length": 278.75,
      "epoch": 0.02666666666666667,
      "grad_norm": 0.8190663456916809,
      "kl": 0.06005859375,
      "learning_rate": 5.2666666666666665e-06,
      "loss": -0.0029,
      "num_tokens": 4036218.0,
      "reward": 1.0388395190238953,
      "reward_std": 0.3393943142145872,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.08539126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.093787407502532,
      "rewards/penalized_accuracy_reward/std": 0.27905965596437454,
      "rewards/reasoning_steps_reward/mean": 0.9166666716337204,
      "rewards/reasoning_steps_reward/std": 0.1558472253382206,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.021347815170884132,
      "step": 80
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 749.25,
      "completions/max_terminated_length": 721.25,
      "completions/mean_length": 470.453125,
      "completions/mean_terminated_length": 463.5166778564453,
      "completions/min_length": 237.75,
      "completions/min_terminated_length": 237.75,
      "epoch": 0.027,
      "grad_norm": 0.7197569012641907,
      "kl": 0.0582275390625,
      "learning_rate": 5.333333333333334e-06,
      "loss": 0.0067,
      "num_tokens": 4080423.0,
      "reward": 1.0867950171232224,
      "reward_std": 0.3874451629817486,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.14789126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.12611790746450424,
      "rewards/penalized_accuracy_reward/std": 0.3361714631319046,
      "rewards/reasoning_steps_reward/mean": 0.9635416865348816,
      "rewards/reasoning_steps_reward/std": 0.10776668787002563,
      "rewards/tag_count_reward/mean": 0.9765625,
      "rewards/tag_count_reward/std": 0.08384781517088413,
      "step": 81
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 781.25,
      "completions/max_terminated_length": 774.25,
      "completions/mean_length": 486.921875,
      "completions/mean_terminated_length": 479.765625,
      "completions/min_length": 237.25,
      "completions/min_terminated_length": 237.25,
      "epoch": 0.027333333333333334,
      "grad_norm": 0.7567655444145203,
      "kl": 0.062744140625,
      "learning_rate": 5.400000000000001e-06,
      "loss": -0.0118,
      "num_tokens": 4121794.0,
      "reward": 1.0785975009202957,
      "reward_std": 0.4485790819162503,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.14789126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.15333710610866547,
      "rewards/penalized_accuracy_reward/std": 0.34840136766433716,
      "rewards/reasoning_steps_reward/mean": 0.8958333432674408,
      "rewards/reasoning_steps_reward/std": 0.17417392134666443,
      "rewards/tag_count_reward/mean": 0.9609375,
      "rewards/tag_count_reward/std": 0.10694126039743423,
      "step": 82
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 732.25,
      "completions/max_terminated_length": 732.25,
      "completions/mean_length": 482.203125,
      "completions/mean_terminated_length": 482.203125,
      "completions/min_length": 227.5,
      "completions/min_terminated_length": 227.5,
      "epoch": 0.027666666666666666,
      "grad_norm": 0.7868958115577698,
      "kl": 0.06304931640625,
      "learning_rate": 5.466666666666667e-06,
      "loss": 0.0333,
      "num_tokens": 4163967.0,
      "reward": 1.3195671439170837,
      "reward_std": 0.5870583718642592,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.3366243988275528,
      "rewards/penalized_accuracy_reward/std": 0.538981094956398,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.98046875,
      "rewards/tag_count_reward/std": 0.078125,
      "step": 83
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 846.0,
      "completions/max_terminated_length": 820.75,
      "completions/mean_length": 528.234375,
      "completions/mean_terminated_length": 521.2843780517578,
      "completions/min_length": 270.25,
      "completions/min_terminated_length": 270.25,
      "epoch": 0.028,
      "grad_norm": 0.6654375195503235,
      "kl": 0.05926513671875,
      "learning_rate": 5.533333333333334e-06,
      "loss": -0.0567,
      "num_tokens": 4208846.0,
      "reward": 1.0562476068735123,
      "reward_std": 0.35241691023111343,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.09452884644269943,
      "rewards/penalized_accuracy_reward/std": 0.2987591028213501,
      "rewards/reasoning_steps_reward/mean": 0.9375000149011612,
      "rewards/reasoning_steps_reward/std": 0.17659492790699005,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 84
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 892.5,
      "completions/max_terminated_length": 805.0,
      "completions/mean_length": 576.078125,
      "completions/mean_terminated_length": 568.4114685058594,
      "completions/min_length": 329.5,
      "completions/min_terminated_length": 329.5,
      "epoch": 0.028333333333333332,
      "grad_norm": 0.8431299328804016,
      "kl": 0.0531005859375,
      "learning_rate": 5.600000000000001e-06,
      "loss": 0.0185,
      "num_tokens": 4259827.0,
      "reward": 0.9817708432674408,
      "reward_std": 0.06735242495778948,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.96875,
      "rewards/tag_count_reward/std": 0.11091229319572449,
      "step": 85
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 640.75,
      "completions/max_terminated_length": 595.5,
      "completions/mean_length": 413.34375,
      "completions/mean_terminated_length": 407.17189025878906,
      "completions/min_length": 213.25,
      "completions/min_terminated_length": 213.25,
      "epoch": 0.028666666666666667,
      "grad_norm": 0.8590614795684814,
      "kl": 0.07122802734375,
      "learning_rate": 5.666666666666667e-06,
      "loss": 0.0421,
      "num_tokens": 4296057.0,
      "reward": 1.1069028824567795,
      "reward_std": 0.3679976146668196,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.21039126068353653,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.911458358168602,
      "rewards/reasoning_steps_reward/std": 0.2015480175614357,
      "rewards/tag_count_reward/mean": 0.98046875,
      "rewards/tag_count_reward/std": 0.06524410098791122,
      "step": 86
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 989.0,
      "completions/max_terminated_length": 981.0,
      "completions/mean_length": 705.484375,
      "completions/mean_terminated_length": 672.3079986572266,
      "completions/min_length": 404.5,
      "completions/min_terminated_length": 404.5,
      "epoch": 0.029,
      "grad_norm": 0.5832537412643433,
      "kl": 0.047698974609375,
      "learning_rate": 5.733333333333334e-06,
      "loss": 0.0553,
      "num_tokens": 4352808.0,
      "reward": 0.9296875,
      "reward_std": 0.10884983465075493,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.21347815543413162,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.04929708316922188,
      "rewards/tag_count_reward/mean": 0.9375,
      "rewards/tag_count_reward/std": 0.12680982053279877,
      "step": 87
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 911.5,
      "completions/max_terminated_length": 840.25,
      "completions/mean_length": 578.09375,
      "completions/mean_terminated_length": 570.7739715576172,
      "completions/min_length": 302.0,
      "completions/min_terminated_length": 302.0,
      "epoch": 0.029333333333333333,
      "grad_norm": 0.8061636686325073,
      "kl": 0.06048583984375,
      "learning_rate": 5.8e-06,
      "loss": -0.0503,
      "num_tokens": 4400094.0,
      "reward": 1.097521647810936,
      "reward_std": 0.2972461935132742,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.1558550000190735,
      "rewards/penalized_accuracy_reward/std": 0.18377679586410522,
      "rewards/reasoning_steps_reward/mean": 0.911458358168602,
      "rewards/reasoning_steps_reward/std": 0.20518534630537033,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.0625,
      "step": 88
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 875.25,
      "completions/max_terminated_length": 813.0,
      "completions/mean_length": 573.25,
      "completions/mean_terminated_length": 557.3623657226562,
      "completions/min_length": 342.0,
      "completions/min_terminated_length": 342.0,
      "epoch": 0.029666666666666668,
      "grad_norm": 0.8729704022407532,
      "kl": 0.05291748046875,
      "learning_rate": 5.8666666666666675e-06,
      "loss": -0.0207,
      "num_tokens": 4448718.0,
      "reward": 0.9381932765245438,
      "reward_std": 0.2505082078278065,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.31116948276758194,
      "rewards/penalized_accuracy_reward/mean": 0.029729731380939484,
      "rewards/penalized_accuracy_reward/std": 0.11891893297433853,
      "rewards/reasoning_steps_reward/mean": 0.9114583432674408,
      "rewards/reasoning_steps_reward/std": 0.14012115448713303,
      "rewards/tag_count_reward/mean": 0.96484375,
      "rewards/tag_count_reward/std": 0.10485684871673584,
      "step": 89
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 947.5,
      "completions/max_terminated_length": 923.25,
      "completions/mean_length": 628.734375,
      "completions/mean_terminated_length": 622.8916778564453,
      "completions/min_length": 381.75,
      "completions/min_terminated_length": 381.75,
      "epoch": 0.03,
      "grad_norm": 0.705189049243927,
      "kl": 0.0528564453125,
      "learning_rate": 5.933333333333335e-06,
      "loss": 0.0152,
      "num_tokens": 4500989.0,
      "reward": 0.9658854156732559,
      "reward_std": 0.10832381062209606,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.1875,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.973958358168602,
      "rewards/reasoning_steps_reward/std": 0.09096374735236168,
      "rewards/tag_count_reward/mean": 0.9765625,
      "rewards/tag_count_reward/std": 0.09375,
      "step": 90
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 1011.75,
      "completions/max_terminated_length": 1001.0,
      "completions/mean_length": 716.953125,
      "completions/mean_terminated_length": 704.4674224853516,
      "completions/min_length": 464.75,
      "completions/min_terminated_length": 464.75,
      "epoch": 0.030333333333333334,
      "grad_norm": 0.6014336347579956,
      "kl": 0.04913330078125,
      "learning_rate": 6e-06,
      "loss": 0.0312,
      "num_tokens": 4559210.0,
      "reward": 1.0703469514846802,
      "reward_std": 0.3709853794425726,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.2640564441680908,
      "rewards/penalized_accuracy_reward/mean": 0.13141468167304993,
      "rewards/penalized_accuracy_reward/std": 0.2352112978696823,
      "rewards/reasoning_steps_reward/mean": 0.973958358168602,
      "rewards/reasoning_steps_reward/std": 0.1041666604578495,
      "rewards/tag_count_reward/mean": 0.95703125,
      "rewards/tag_count_reward/std": 0.10916591435670853,
      "step": 91
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 875.75,
      "completions/max_terminated_length": 857.75,
      "completions/mean_length": 622.109375,
      "completions/mean_terminated_length": 608.2596282958984,
      "completions/min_length": 389.25,
      "completions/min_terminated_length": 389.25,
      "epoch": 0.030666666666666665,
      "grad_norm": 0.699589729309082,
      "kl": 0.0509033203125,
      "learning_rate": 6.066666666666667e-06,
      "loss": 0.0111,
      "num_tokens": 4606721.0,
      "reward": 1.1923158913850784,
      "reward_std": 0.35641849786043167,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.23680340498685837,
      "rewards/penalized_accuracy_reward/mean": 0.24192526936531067,
      "rewards/penalized_accuracy_reward/std": 0.28356412053108215,
      "rewards/reasoning_steps_reward/mean": 0.984375,
      "rewards/reasoning_steps_reward/std": 0.03359273821115494,
      "rewards/tag_count_reward/mean": 0.95703125,
      "rewards/tag_count_reward/std": 0.14332501962780952,
      "step": 92
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 940.5,
      "completions/max_terminated_length": 908.25,
      "completions/mean_length": 607.8125,
      "completions/mean_terminated_length": 602.2208404541016,
      "completions/min_length": 317.75,
      "completions/min_terminated_length": 317.75,
      "epoch": 0.031,
      "grad_norm": 1.3742492198944092,
      "kl": 0.0843505859375,
      "learning_rate": 6.133333333333334e-06,
      "loss": -0.0486,
      "num_tokens": 4656085.0,
      "reward": 0.9291666746139526,
      "reward_std": 0.14688335917890072,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.23328252136707306,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.927083358168602,
      "rewards/reasoning_steps_reward/std": 0.14779141545295715,
      "rewards/tag_count_reward/mean": 0.96875,
      "rewards/tag_count_reward/std": 0.10037772543728352,
      "step": 93
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 874.75,
      "completions/max_terminated_length": 818.25,
      "completions/mean_length": 534.53125,
      "completions/mean_terminated_length": 527.3885498046875,
      "completions/min_length": 350.25,
      "completions/min_terminated_length": 350.25,
      "epoch": 0.03133333333333333,
      "grad_norm": 0.8657882809638977,
      "kl": 0.06103515625,
      "learning_rate": 6.200000000000001e-06,
      "loss": 0.0349,
      "num_tokens": 4701863.0,
      "reward": 0.9688801914453506,
      "reward_std": 0.10679070092737675,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.1875,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.06615880131721497,
      "rewards/tag_count_reward/mean": 0.98046875,
      "rewards/tag_count_reward/std": 0.078125,
      "step": 94
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 866.25,
      "completions/max_terminated_length": 838.75,
      "completions/mean_length": 599.34375,
      "completions/mean_terminated_length": 593.5364685058594,
      "completions/min_length": 338.75,
      "completions/min_terminated_length": 338.75,
      "epoch": 0.03166666666666667,
      "grad_norm": 0.7609896063804626,
      "kl": 0.06109619140625,
      "learning_rate": 6.266666666666668e-06,
      "loss": -0.011,
      "num_tokens": 4748829.0,
      "reward": 0.9533854424953461,
      "reward_std": 0.11424364056438208,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.2257782220840454,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.08699213340878487,
      "rewards/tag_count_reward/mean": 0.9765625,
      "rewards/tag_count_reward/std": 0.07206955552101135,
      "step": 95
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 840.75,
      "completions/max_terminated_length": 840.75,
      "completions/mean_length": 591.15625,
      "completions/mean_terminated_length": 591.15625,
      "completions/min_length": 321.0,
      "completions/min_terminated_length": 321.0,
      "epoch": 0.032,
      "grad_norm": 0.7026406526565552,
      "kl": 0.05560302734375,
      "learning_rate": 6.333333333333333e-06,
      "loss": -0.0253,
      "num_tokens": 4796583.0,
      "reward": 1.0508922636508942,
      "reward_std": 0.2978524469071999,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.14789126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.1111786812543869,
      "rewards/penalized_accuracy_reward/std": 0.19907879829406738,
      "rewards/reasoning_steps_reward/mean": 0.9270833432674408,
      "rewards/reasoning_steps_reward/std": 0.12794098258018494,
      "rewards/tag_count_reward/mean": 0.94921875,
      "rewards/tag_count_reward/std": 0.12456496804952621,
      "step": 96
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 809.25,
      "completions/max_terminated_length": 809.25,
      "completions/mean_length": 510.171875,
      "completions/mean_terminated_length": 510.171875,
      "completions/min_length": 285.0,
      "completions/min_terminated_length": 285.0,
      "epoch": 0.03233333333333333,
      "grad_norm": 0.8602376580238342,
      "kl": 0.0557861328125,
      "learning_rate": 6.4000000000000006e-06,
      "loss": -0.0679,
      "num_tokens": 4844402.0,
      "reward": 1.120245411992073,
      "reward_std": 0.32840642519295216,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.17519332468509674,
      "rewards/penalized_accuracy_reward/std": 0.23455476760864258,
      "rewards/reasoning_steps_reward/mean": 0.9166666865348816,
      "rewards/reasoning_steps_reward/std": 0.19030534476041794,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 97
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 884.0,
      "completions/max_terminated_length": 884.0,
      "completions/mean_length": 565.53125,
      "completions/mean_terminated_length": 565.53125,
      "completions/min_length": 272.5,
      "completions/min_terminated_length": 272.5,
      "epoch": 0.03266666666666666,
      "grad_norm": 0.4225853979587555,
      "kl": 0.05230712890625,
      "learning_rate": 6.466666666666667e-06,
      "loss": -0.0753,
      "num_tokens": 4891300.0,
      "reward": 0.9843750149011612,
      "reward_std": 0.04488958604633808,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9687500149011612,
      "rewards/reasoning_steps_reward/std": 0.08977919071912766,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 98
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 699.25,
      "completions/max_terminated_length": 699.25,
      "completions/mean_length": 452.25,
      "completions/mean_terminated_length": 452.25,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "epoch": 0.033,
      "grad_norm": 0.8484078049659729,
      "kl": 0.05780029296875,
      "learning_rate": 6.533333333333334e-06,
      "loss": -0.1158,
      "num_tokens": 4930772.0,
      "reward": 1.023941695690155,
      "reward_std": 0.24003357347100973,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.07224894315004349,
      "rewards/penalized_accuracy_reward/std": 0.15540502965450287,
      "rewards/reasoning_steps_reward/mean": 0.9166667014360428,
      "rewards/reasoning_steps_reward/std": 0.19612576067447662,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 99
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 823.75,
      "completions/max_terminated_length": 813.0,
      "completions/mean_length": 513.296875,
      "completions/mean_terminated_length": 507.20314025878906,
      "completions/min_length": 221.75,
      "completions/min_terminated_length": 221.75,
      "epoch": 0.03333333333333333,
      "grad_norm": 0.8136687874794006,
      "kl": 0.049072265625,
      "learning_rate": 6.600000000000001e-06,
      "loss": -0.0653,
      "num_tokens": 4972919.0,
      "reward": 0.8863281309604645,
      "reward_std": 0.17739969119429588,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.23328252136707306,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.8437500149011612,
      "rewards/reasoning_steps_reward/std": 0.22764474898576736,
      "rewards/tag_count_reward/mean": 0.95703125,
      "rewards/tag_count_reward/std": 0.15899410098791122,
      "step": 100
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 705.25,
      "completions/max_terminated_length": 705.25,
      "completions/mean_length": 473.109375,
      "completions/mean_terminated_length": 473.109375,
      "completions/min_length": 202.25,
      "completions/min_terminated_length": 202.25,
      "epoch": 0.033666666666666664,
      "grad_norm": 0.9104421138763428,
      "kl": 0.05877685546875,
      "learning_rate": 6.666666666666667e-06,
      "loss": -0.0692,
      "num_tokens": 5011790.0,
      "reward": 1.0768784284591675,
      "reward_std": 0.5057692248374224,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.1875,
      "rewards/penalized_accuracy_reward/mean": 0.19367528706789017,
      "rewards/penalized_accuracy_reward/std": 0.417163223028183,
      "rewards/reasoning_steps_reward/mean": 0.8125000149011612,
      "rewards/reasoning_steps_reward/std": 0.2908661887049675,
      "rewards/tag_count_reward/mean": 0.95703125,
      "rewards/tag_count_reward/std": 0.14216844737529755,
      "step": 101
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 833.0,
      "completions/max_terminated_length": 772.25,
      "completions/mean_length": 542.375,
      "completions/mean_terminated_length": 528.0625,
      "completions/min_length": 267.75,
      "completions/min_terminated_length": 267.75,
      "epoch": 0.034,
      "grad_norm": 0.7064374089241028,
      "kl": 0.05255126953125,
      "learning_rate": 6.733333333333334e-06,
      "loss": 0.0097,
      "num_tokens": 5055558.0,
      "reward": 0.9852046072483063,
      "reward_std": 0.20568905398249626,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.21039126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.032470230013132095,
      "rewards/penalized_accuracy_reward/std": 0.12988092005252838,
      "rewards/reasoning_steps_reward/mean": 0.96875,
      "rewards/reasoning_steps_reward/std": 0.10782546922564507,
      "rewards/tag_count_reward/mean": 0.93359375,
      "rewards/tag_count_reward/std": 0.18054034188389778,
      "step": 102
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 952.0,
      "completions/max_terminated_length": 939.5,
      "completions/mean_length": 630.1875,
      "completions/mean_terminated_length": 595.8125,
      "completions/min_length": 232.25,
      "completions/min_terminated_length": 232.25,
      "epoch": 0.034333333333333334,
      "grad_norm": 0.6197292804718018,
      "kl": 0.0474853515625,
      "learning_rate": 6.800000000000001e-06,
      "loss": -0.0475,
      "num_tokens": 5107490.0,
      "reward": 0.9695312529802322,
      "reward_std": 0.20866616070270538,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.18217839300632477,
      "rewards/penalized_accuracy_reward/mean": 0.02734375,
      "rewards/penalized_accuracy_reward/std": 0.1093750074505806,
      "rewards/reasoning_steps_reward/mean": 0.9687500149011612,
      "rewards/reasoning_steps_reward/std": 0.10782546550035477,
      "rewards/tag_count_reward/mean": 0.953125,
      "rewards/tag_count_reward/std": 0.11353103816509247,
      "step": 103
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 818.5,
      "completions/max_terminated_length": 762.5,
      "completions/mean_length": 550.09375,
      "completions/mean_terminated_length": 541.9562530517578,
      "completions/min_length": 324.0,
      "completions/min_terminated_length": 324.0,
      "epoch": 0.034666666666666665,
      "grad_norm": 0.5912279486656189,
      "kl": 0.05926513671875,
      "learning_rate": 6.866666666666667e-06,
      "loss": -0.0204,
      "num_tokens": 5151560.0,
      "reward": 1.1052062809467316,
      "reward_std": 0.4189284183084965,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.08539126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.13932083547115326,
      "rewards/penalized_accuracy_reward/std": 0.38097113370895386,
      "rewards/reasoning_steps_reward/mean": 0.9583333432674408,
      "rewards/reasoning_steps_reward/std": 0.09622505307197571,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.021347815170884132,
      "step": 104
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 923.75,
      "completions/max_terminated_length": 917.75,
      "completions/mean_length": 665.375,
      "completions/mean_terminated_length": 657.4486694335938,
      "completions/min_length": 394.75,
      "completions/min_terminated_length": 394.75,
      "epoch": 0.035,
      "grad_norm": 0.6213710308074951,
      "kl": 0.049072265625,
      "learning_rate": 6.9333333333333344e-06,
      "loss": 0.0131,
      "num_tokens": 5204944.0,
      "reward": 0.9998129159212112,
      "reward_std": 0.159408881329,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.10077822208404541,
      "rewards/penalized_accuracy_reward/mean": 0.024943124502897263,
      "rewards/penalized_accuracy_reward/std": 0.09977250546216965,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.025194555521011353,
      "step": 105
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 890.0,
      "completions/max_terminated_length": 890.0,
      "completions/mean_length": 604.21875,
      "completions/mean_terminated_length": 604.21875,
      "completions/min_length": 247.0,
      "completions/min_terminated_length": 247.0,
      "epoch": 0.035333333333333335,
      "grad_norm": 0.749445915222168,
      "kl": 0.05389404296875,
      "learning_rate": 7e-06,
      "loss": -0.1121,
      "num_tokens": 5254142.0,
      "reward": 1.0192549675703049,
      "reward_std": 0.2774948216974735,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.060921624302864075,
      "rewards/penalized_accuracy_reward/std": 0.2436865046620369,
      "rewards/reasoning_steps_reward/mean": 0.9479166865348816,
      "rewards/reasoning_steps_reward/std": 0.14323293417692184,
      "rewards/tag_count_reward/mean": 0.96875,
      "rewards/tag_count_reward/std": 0.125,
      "step": 106
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 948.75,
      "completions/max_terminated_length": 925.75,
      "completions/mean_length": 705.203125,
      "completions/mean_terminated_length": 693.4765014648438,
      "completions/min_length": 479.5,
      "completions/min_terminated_length": 479.5,
      "epoch": 0.035666666666666666,
      "grad_norm": 0.6594750285148621,
      "kl": 0.05364990234375,
      "learning_rate": 7.066666666666667e-06,
      "loss": 0.0348,
      "num_tokens": 5308507.0,
      "reward": 1.0982975512742996,
      "reward_std": 0.3949567638337612,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.2257782220840454,
      "rewards/penalized_accuracy_reward/mean": 0.14165689051151276,
      "rewards/penalized_accuracy_reward/std": 0.31518884748220444,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.0624999962747097,
      "rewards/tag_count_reward/mean": 0.95703125,
      "rewards/tag_count_reward/std": 0.14216844737529755,
      "step": 107
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 938.75,
      "completions/max_terminated_length": 919.0,
      "completions/mean_length": 693.21875,
      "completions/mean_terminated_length": 689.9916839599609,
      "completions/min_length": 470.5,
      "completions/min_terminated_length": 470.5,
      "epoch": 0.036,
      "grad_norm": 0.6489946842193604,
      "kl": 0.0594482421875,
      "learning_rate": 7.133333333333334e-06,
      "loss": 0.0069,
      "num_tokens": 5360985.0,
      "reward": 0.936531126499176,
      "reward_std": 0.27230495028197765,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.34944770485162735,
      "rewards/penalized_accuracy_reward/mean": 0.03379674255847931,
      "rewards/penalized_accuracy_reward/std": 0.13518697023391724,
      "rewards/reasoning_steps_reward/mean": 0.9375000149011612,
      "rewards/reasoning_steps_reward/std": 0.152117520570755,
      "rewards/tag_count_reward/mean": 0.90234375,
      "rewards/tag_count_reward/std": 0.26711349189281464,
      "step": 108
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 747.75,
      "completions/max_terminated_length": 729.0,
      "completions/mean_length": 517.265625,
      "completions/mean_terminated_length": 512.7416687011719,
      "completions/min_length": 343.75,
      "completions/min_terminated_length": 343.75,
      "epoch": 0.036333333333333336,
      "grad_norm": 0.7832958102226257,
      "kl": 0.0709228515625,
      "learning_rate": 7.2000000000000005e-06,
      "loss": 0.008,
      "num_tokens": 5402474.0,
      "reward": 1.364576980471611,
      "reward_std": 0.522902300581336,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.21039126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.4221290349960327,
      "rewards/penalized_accuracy_reward/std": 0.47028493881225586,
      "rewards/reasoning_steps_reward/mean": 0.9427083432674408,
      "rewards/reasoning_steps_reward/std": 0.11986106634140015,
      "rewards/tag_count_reward/mean": 0.9609375,
      "rewards/tag_count_reward/std": 0.14152991026639938,
      "step": 109
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 924.0,
      "completions/max_terminated_length": 888.25,
      "completions/mean_length": 714.40625,
      "completions/mean_terminated_length": 688.4537963867188,
      "completions/min_length": 413.75,
      "completions/min_terminated_length": 413.75,
      "epoch": 0.03666666666666667,
      "grad_norm": 0.7029095888137817,
      "kl": 0.05279541015625,
      "learning_rate": 7.266666666666668e-06,
      "loss": 0.0519,
      "num_tokens": 5461796.0,
      "reward": 0.9207031279802322,
      "reward_std": 0.1675838977098465,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.33539126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.04929708316922188,
      "rewards/tag_count_reward/mean": 0.91015625,
      "rewards/tag_count_reward/std": 0.22450439631938934,
      "step": 110
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 986.0,
      "completions/max_terminated_length": 947.25,
      "completions/mean_length": 736.84375,
      "completions/mean_terminated_length": 703.7214202880859,
      "completions/min_length": 446.0,
      "completions/min_terminated_length": 446.0,
      "epoch": 0.037,
      "grad_norm": 0.6748602986335754,
      "kl": 0.05230712890625,
      "learning_rate": 7.333333333333333e-06,
      "loss": 0.0767,
      "num_tokens": 5517274.0,
      "reward": 0.9266927242279053,
      "reward_std": 0.15901001170277596,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.3300696536898613,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.0833333320915699,
      "rewards/tag_count_reward/mean": 0.93359375,
      "rewards/tag_count_reward/std": 0.1924012266099453,
      "step": 111
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 940.75,
      "completions/max_terminated_length": 896.75,
      "completions/mean_length": 634.515625,
      "completions/mean_terminated_length": 627.9427185058594,
      "completions/min_length": 337.75,
      "completions/min_terminated_length": 337.75,
      "epoch": 0.037333333333333336,
      "grad_norm": 0.7816139459609985,
      "kl": 0.06292724609375,
      "learning_rate": 7.4e-06,
      "loss": -0.05,
      "num_tokens": 5566667.0,
      "reward": 1.1322840005159378,
      "reward_std": 0.37390279583632946,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.1632782220840454,
      "rewards/penalized_accuracy_reward/mean": 0.18606003746390343,
      "rewards/penalized_accuracy_reward/std": 0.2862424701452255,
      "rewards/reasoning_steps_reward/mean": 0.9479166716337204,
      "rewards/reasoning_steps_reward/std": 0.14323293790221214,
      "rewards/tag_count_reward/mean": 0.97265625,
      "rewards/tag_count_reward/std": 0.07966844737529755,
      "step": 112
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 801.75,
      "completions/max_terminated_length": 768.75,
      "completions/mean_length": 557.125,
      "completions/mean_terminated_length": 550.9229278564453,
      "completions/min_length": 282.0,
      "completions/min_terminated_length": 282.0,
      "epoch": 0.03766666666666667,
      "grad_norm": 0.710523247718811,
      "kl": 0.08251953125,
      "learning_rate": 7.4666666666666675e-06,
      "loss": 0.0007,
      "num_tokens": 5611475.0,
      "reward": 1.057383418083191,
      "reward_std": 0.24412458762526512,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.08539126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.08485737442970276,
      "rewards/penalized_accuracy_reward/std": 0.1838604211807251,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.09096375107765198,
      "rewards/tag_count_reward/mean": 0.98046875,
      "rewards/tag_count_reward/std": 0.06340491026639938,
      "step": 113
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1000.5,
      "completions/max_terminated_length": 932.5,
      "completions/mean_length": 744.984375,
      "completions/mean_terminated_length": 718.1194305419922,
      "completions/min_length": 399.75,
      "completions/min_terminated_length": 399.75,
      "epoch": 0.038,
      "grad_norm": 0.7475992441177368,
      "kl": 0.06201171875,
      "learning_rate": 7.533333333333334e-06,
      "loss": 0.0666,
      "num_tokens": 5670786.0,
      "reward": 0.993861049413681,
      "reward_std": 0.2636701911687851,
      "rewards/format_reward/mean": 0.875,
      "rewards/format_reward/std": 0.2750816270709038,
      "rewards/penalized_accuracy_reward/mean": 0.05232458561658859,
      "rewards/penalized_accuracy_reward/std": 0.14314451813697815,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.94140625,
      "rewards/tag_count_reward/std": 0.13477232307195663,
      "step": 114
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 977.0,
      "completions/max_terminated_length": 938.75,
      "completions/mean_length": 711.25,
      "completions/mean_terminated_length": 691.9525451660156,
      "completions/min_length": 438.5,
      "completions/min_terminated_length": 438.5,
      "epoch": 0.03833333333333333,
      "grad_norm": 0.7733854651451111,
      "kl": 0.0596923828125,
      "learning_rate": 7.600000000000001e-06,
      "loss": 0.0926,
      "num_tokens": 5730226.0,
      "reward": 0.960807278752327,
      "reward_std": 0.12203849479556084,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.27289126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.97265625,
      "rewards/tag_count_reward/std": 0.08659191615879536,
      "step": 115
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 929.0,
      "completions/max_terminated_length": 911.25,
      "completions/mean_length": 694.640625,
      "completions/mean_terminated_length": 666.6000061035156,
      "completions/min_length": 331.5,
      "completions/min_terminated_length": 331.5,
      "epoch": 0.03866666666666667,
      "grad_norm": 0.5846694111824036,
      "kl": 0.06732177734375,
      "learning_rate": 7.666666666666667e-06,
      "loss": -0.0038,
      "num_tokens": 5783451.0,
      "reward": 0.9777296334505081,
      "reward_std": 0.20587004628032446,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.1280868947505951,
      "rewards/penalized_accuracy_reward/mean": 0.03437023237347603,
      "rewards/penalized_accuracy_reward/std": 0.1374809294939041,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.04929708316922188,
      "rewards/tag_count_reward/mean": 0.94921875,
      "rewards/tag_count_reward/std": 0.0654262900352478,
      "step": 116
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 950.25,
      "completions/max_terminated_length": 907.75,
      "completions/mean_length": 673.171875,
      "completions/mean_terminated_length": 658.1291809082031,
      "completions/min_length": 412.25,
      "completions/min_terminated_length": 412.25,
      "epoch": 0.039,
      "grad_norm": 0.8333153128623962,
      "kl": 0.071044921875,
      "learning_rate": 7.733333333333334e-06,
      "loss": -0.0128,
      "num_tokens": 5837190.0,
      "reward": 1.3094747960567474,
      "reward_std": 0.6067601572722197,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.1875,
      "rewards/penalized_accuracy_reward/mean": 0.3357768412679434,
      "rewards/penalized_accuracy_reward/std": 0.5968192145228386,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.9765625,
      "rewards/tag_count_reward/std": 0.08086910098791122,
      "step": 117
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 906.25,
      "completions/max_terminated_length": 906.25,
      "completions/mean_length": 637.984375,
      "completions/mean_terminated_length": 637.984375,
      "completions/min_length": 362.75,
      "completions/min_terminated_length": 362.75,
      "epoch": 0.03933333333333333,
      "grad_norm": 0.3177330195903778,
      "kl": 0.0679931640625,
      "learning_rate": 7.800000000000002e-06,
      "loss": -0.0297,
      "num_tokens": 5889061.0,
      "reward": 0.9921875,
      "reward_std": 0.022662729024887085,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.984375,
      "rewards/reasoning_steps_reward/std": 0.045325469225645065,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 118
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 960.75,
      "completions/max_terminated_length": 933.5,
      "completions/mean_length": 682.9375,
      "completions/mean_terminated_length": 671.2948150634766,
      "completions/min_length": 391.5,
      "completions/min_terminated_length": 391.5,
      "epoch": 0.03966666666666667,
      "grad_norm": 0.8171834945678711,
      "kl": 0.06939697265625,
      "learning_rate": 7.866666666666667e-06,
      "loss": 0.0615,
      "num_tokens": 5943329.0,
      "reward": 1.051426261663437,
      "reward_std": 0.2572258897125721,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.21039126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.07798877358436584,
      "rewards/penalized_accuracy_reward/std": 0.1678096055984497,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.05259781517088413,
      "step": 119
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 819.5,
      "completions/max_terminated_length": 819.5,
      "completions/mean_length": 581.84375,
      "completions/mean_terminated_length": 581.84375,
      "completions/min_length": 377.5,
      "completions/min_terminated_length": 377.5,
      "epoch": 0.04,
      "grad_norm": 0.6569817662239075,
      "kl": 0.07470703125,
      "learning_rate": 7.933333333333334e-06,
      "loss": -0.0118,
      "num_tokens": 5989639.0,
      "reward": 1.2491122335195541,
      "reward_std": 0.22335275262594223,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.14789126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.280622661113739,
      "rewards/penalized_accuracy_reward/std": 0.17314550280570984,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.0833333320915699,
      "rewards/tag_count_reward/mean": 0.9765625,
      "rewards/tag_count_reward/std": 0.06798820197582245,
      "step": 120
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 892.25,
      "completions/max_terminated_length": 800.5,
      "completions/mean_length": 574.3125,
      "completions/mean_terminated_length": 553.1169738769531,
      "completions/min_length": 312.25,
      "completions/min_terminated_length": 312.25,
      "epoch": 0.04033333333333333,
      "grad_norm": 0.6567302346229553,
      "kl": 0.078125,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.1119,
      "num_tokens": 6033883.0,
      "reward": 0.9722656160593033,
      "reward_std": 0.07208464667201042,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.1632782220840454,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.97265625,
      "rewards/tag_count_reward/std": 0.07966844737529755,
      "step": 121
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 739.25,
      "completions/max_terminated_length": 739.25,
      "completions/mean_length": 541.765625,
      "completions/mean_terminated_length": 541.765625,
      "completions/min_length": 366.25,
      "completions/min_terminated_length": 366.25,
      "epoch": 0.04066666666666666,
      "grad_norm": 0.8157733678817749,
      "kl": 0.080810546875,
      "learning_rate": 8.066666666666667e-06,
      "loss": -0.0225,
      "num_tokens": 6079660.0,
      "reward": 1.0707407891750336,
      "reward_std": 0.38787663727998734,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.1632782220840454,
      "rewards/penalized_accuracy_reward/mean": 0.10589701682329178,
      "rewards/penalized_accuracy_reward/std": 0.33678513765335083,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.0624999962747097,
      "rewards/tag_count_reward/mean": 0.9765625,
      "rewards/tag_count_reward/std": 0.06520001962780952,
      "step": 122
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 963.5,
      "completions/max_terminated_length": 961.25,
      "completions/mean_length": 680.15625,
      "completions/mean_terminated_length": 674.4635467529297,
      "completions/min_length": 405.75,
      "completions/min_terminated_length": 405.75,
      "epoch": 0.041,
      "grad_norm": 0.7646713256835938,
      "kl": 0.06640625,
      "learning_rate": 8.133333333333334e-06,
      "loss": 0.0321,
      "num_tokens": 6133430.0,
      "reward": 0.9917968809604645,
      "reward_std": 0.19527434464544058,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.1875,
      "rewards/penalized_accuracy_reward/mean": 0.02734375,
      "rewards/penalized_accuracy_reward/std": 0.1093750074505806,
      "rewards/reasoning_steps_reward/mean": 0.9687500149011612,
      "rewards/reasoning_steps_reward/std": 0.11179708316922188,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 123
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 862.25,
      "completions/max_terminated_length": 862.25,
      "completions/mean_length": 613.53125,
      "completions/mean_terminated_length": 613.53125,
      "completions/min_length": 426.0,
      "completions/min_terminated_length": 426.0,
      "epoch": 0.04133333333333333,
      "grad_norm": 0.5323629379272461,
      "kl": 0.0792236328125,
      "learning_rate": 8.2e-06,
      "loss": 0.0065,
      "num_tokens": 6182744.0,
      "reward": 1.3273025453090668,
      "reward_std": 0.2767297988757491,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.33771923184394836,
      "rewards/penalized_accuracy_reward/std": 0.2713307738304138,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.0833333320915699,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 124
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 957.0,
      "completions/max_terminated_length": 904.25,
      "completions/mean_length": 646.84375,
      "completions/mean_terminated_length": 636.4083557128906,
      "completions/min_length": 406.5,
      "completions/min_terminated_length": 406.5,
      "epoch": 0.041666666666666664,
      "grad_norm": 0.6856974363327026,
      "kl": 0.06842041015625,
      "learning_rate": 8.266666666666667e-06,
      "loss": 0.0573,
      "num_tokens": 6234014.0,
      "reward": 1.0164173543453217,
      "reward_std": 0.20691745728254318,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.037511106580495834,
      "rewards/penalized_accuracy_reward/std": 0.15004444122314453,
      "rewards/reasoning_steps_reward/mean": 0.984375,
      "rewards/reasoning_steps_reward/std": 0.045325469225645065,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 125
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 883.5,
      "completions/max_terminated_length": 883.5,
      "completions/mean_length": 606.40625,
      "completions/mean_terminated_length": 606.40625,
      "completions/min_length": 335.75,
      "completions/min_terminated_length": 335.75,
      "epoch": 0.042,
      "grad_norm": 0.5305747985839844,
      "kl": 0.0701904296875,
      "learning_rate": 8.333333333333334e-06,
      "loss": -0.0483,
      "num_tokens": 6281528.0,
      "reward": 0.9869791865348816,
      "reward_std": 0.04548186343163252,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.973958358168602,
      "rewards/reasoning_steps_reward/std": 0.09096374735236168,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 126
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 846.25,
      "completions/max_terminated_length": 846.25,
      "completions/mean_length": 568.875,
      "completions/mean_terminated_length": 568.875,
      "completions/min_length": 368.75,
      "completions/min_terminated_length": 368.75,
      "epoch": 0.042333333333333334,
      "grad_norm": 0.6627109050750732,
      "kl": 0.0797119140625,
      "learning_rate": 8.400000000000001e-06,
      "loss": 0.0405,
      "num_tokens": 6326704.0,
      "reward": 0.9777343720197678,
      "reward_std": 0.0601552352309227,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.96875,
      "rewards/reasoning_steps_reward/std": 0.06718547642230988,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 127
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 594.25,
      "completions/max_terminated_length": 594.25,
      "completions/mean_length": 446.21875,
      "completions/mean_terminated_length": 446.21875,
      "completions/min_length": 289.5,
      "completions/min_terminated_length": 289.5,
      "epoch": 0.042666666666666665,
      "grad_norm": 0.861020028591156,
      "kl": 0.0860595703125,
      "learning_rate": 8.466666666666668e-06,
      "loss": -0.0615,
      "num_tokens": 6365326.0,
      "reward": 1.0522719621658325,
      "reward_std": 0.27669200487434864,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.09133441746234894,
      "rewards/penalized_accuracy_reward/std": 0.1965949833393097,
      "rewards/reasoning_steps_reward/mean": 0.9218750149011612,
      "rewards/reasoning_steps_reward/std": 0.19525551423430443,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 128
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 690.0,
      "completions/max_terminated_length": 690.0,
      "completions/mean_length": 472.890625,
      "completions/mean_terminated_length": 472.890625,
      "completions/min_length": 295.25,
      "completions/min_terminated_length": 295.25,
      "epoch": 0.043,
      "grad_norm": 0.8932364583015442,
      "kl": 0.0789794921875,
      "learning_rate": 8.533333333333335e-06,
      "loss": -0.0626,
      "num_tokens": 6404775.0,
      "reward": 0.9579332917928696,
      "reward_std": 0.20159808173775673,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.030849946662783623,
      "rewards/penalized_accuracy_reward/std": 0.12339979410171509,
      "rewards/reasoning_steps_reward/mean": 0.8541666865348816,
      "rewards/reasoning_steps_reward/std": 0.1918465420603752,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 129
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 752.0,
      "completions/max_terminated_length": 752.0,
      "completions/mean_length": 564.015625,
      "completions/mean_terminated_length": 564.015625,
      "completions/min_length": 355.75,
      "completions/min_terminated_length": 355.75,
      "epoch": 0.043333333333333335,
      "grad_norm": 0.5215252637863159,
      "kl": 0.0770263671875,
      "learning_rate": 8.6e-06,
      "loss": 0.011,
      "num_tokens": 6450040.0,
      "reward": 1.2660482078790665,
      "reward_std": 0.4907595328986645,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.2738606929779053,
      "rewards/penalized_accuracy_reward/std": 0.47496628761291504,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.04929708316922188,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 130
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 862.75,
      "completions/max_terminated_length": 862.75,
      "completions/mean_length": 583.265625,
      "completions/mean_terminated_length": 583.265625,
      "completions/min_length": 387.75,
      "completions/min_terminated_length": 387.75,
      "epoch": 0.043666666666666666,
      "grad_norm": 0.5661642551422119,
      "kl": 0.06927490234375,
      "learning_rate": 8.666666666666668e-06,
      "loss": -0.044,
      "num_tokens": 6495593.0,
      "reward": 0.9751302152872086,
      "reward_std": 0.06074373424053192,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9635416865348816,
      "rewards/reasoning_steps_reward/std": 0.08656488358974457,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 131
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 865.0,
      "completions/max_terminated_length": 796.25,
      "completions/mean_length": 612.59375,
      "completions/mean_terminated_length": 601.3906402587891,
      "completions/min_length": 390.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 0.044,
      "grad_norm": 0.7252206802368164,
      "kl": 0.07562255859375,
      "learning_rate": 8.733333333333333e-06,
      "loss": 0.0371,
      "num_tokens": 6544847.0,
      "reward": 0.9873867779970169,
      "reward_std": 0.23923740535974503,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.14789126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.035303402692079544,
      "rewards/penalized_accuracy_reward/std": 0.14121361076831818,
      "rewards/reasoning_steps_reward/mean": 0.9479167014360428,
      "rewards/reasoning_steps_reward/std": 0.14964327588677406,
      "rewards/tag_count_reward/mean": 0.96875,
      "rewards/tag_count_reward/std": 0.09120866656303406,
      "step": 132
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 813.75,
      "completions/max_terminated_length": 813.75,
      "completions/mean_length": 564.140625,
      "completions/mean_terminated_length": 564.140625,
      "completions/min_length": 386.5,
      "completions/min_terminated_length": 386.5,
      "epoch": 0.044333333333333336,
      "grad_norm": 0.6854252219200134,
      "kl": 0.07403564453125,
      "learning_rate": 8.8e-06,
      "loss": -0.0208,
      "num_tokens": 6590808.0,
      "reward": 0.9540364742279053,
      "reward_std": 0.0992764113470912,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.14789126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9479166865348816,
      "rewards/reasoning_steps_reward/std": 0.12311986833810806,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.03697281517088413,
      "step": 133
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 880.0,
      "completions/max_terminated_length": 772.25,
      "completions/mean_length": 629.671875,
      "completions/mean_terminated_length": 604.9509124755859,
      "completions/min_length": 390.25,
      "completions/min_terminated_length": 390.25,
      "epoch": 0.04466666666666667,
      "grad_norm": 0.4922334849834442,
      "kl": 0.0748291015625,
      "learning_rate": 8.866666666666668e-06,
      "loss": 0.0967,
      "num_tokens": 6642387.0,
      "reward": 0.9710937589406967,
      "reward_std": 0.07905462384223938,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.17078252136707306,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.9609375,
      "rewards/tag_count_reward/std": 0.11245574057102203,
      "step": 134
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 744.5,
      "completions/max_terminated_length": 744.5,
      "completions/mean_length": 534.625,
      "completions/mean_terminated_length": 534.625,
      "completions/min_length": 347.25,
      "completions/min_terminated_length": 347.25,
      "epoch": 0.045,
      "grad_norm": 0.5973943471908569,
      "kl": 0.06695556640625,
      "learning_rate": 8.933333333333333e-06,
      "loss": -0.0403,
      "num_tokens": 6688251.0,
      "reward": 1.1759473234415054,
      "reward_std": 0.27473309077322483,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.1907910406589508,
      "rewards/penalized_accuracy_reward/std": 0.22456614673137665,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.0624999962747097,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 135
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 871.25,
      "completions/max_terminated_length": 871.25,
      "completions/mean_length": 545.59375,
      "completions/mean_terminated_length": 545.59375,
      "completions/min_length": 346.75,
      "completions/min_terminated_length": 346.75,
      "epoch": 0.04533333333333334,
      "grad_norm": 0.7356082201004028,
      "kl": 0.06689453125,
      "learning_rate": 9e-06,
      "loss": 0.0034,
      "num_tokens": 6731681.0,
      "reward": 0.9843750298023224,
      "reward_std": 0.06249998975545168,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9687500149011612,
      "rewards/reasoning_steps_reward/std": 0.1249999962747097,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 136
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 789.25,
      "completions/max_terminated_length": 789.25,
      "completions/mean_length": 548.453125,
      "completions/mean_terminated_length": 548.453125,
      "completions/min_length": 353.5,
      "completions/min_terminated_length": 353.5,
      "epoch": 0.04566666666666667,
      "grad_norm": 0.5256035327911377,
      "kl": 0.0791015625,
      "learning_rate": 9.066666666666667e-06,
      "loss": -0.0046,
      "num_tokens": 6777742.0,
      "reward": 1.1897451877593994,
      "reward_std": 0.28156070224940777,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.20016179978847504,
      "rewards/penalized_accuracy_reward/std": 0.26698175072669983,
      "rewards/reasoning_steps_reward/mean": 0.9791666865348816,
      "rewards/reasoning_steps_reward/std": 0.0833333283662796,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 137
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 826.75,
      "completions/max_terminated_length": 797.5,
      "completions/mean_length": 540.46875,
      "completions/mean_terminated_length": 534.0687561035156,
      "completions/min_length": 384.75,
      "completions/min_terminated_length": 384.75,
      "epoch": 0.046,
      "grad_norm": 0.6895825266838074,
      "kl": 0.072998046875,
      "learning_rate": 9.133333333333335e-06,
      "loss": 0.0353,
      "num_tokens": 6824652.0,
      "reward": 1.023446962237358,
      "reward_std": 0.2008255310356617,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.03790007159113884,
      "rewards/penalized_accuracy_reward/std": 0.15160028636455536,
      "rewards/reasoning_steps_reward/mean": 0.984375,
      "rewards/reasoning_steps_reward/std": 0.045325469225645065,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 138
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 903.25,
      "completions/max_terminated_length": 903.25,
      "completions/mean_length": 568.640625,
      "completions/mean_terminated_length": 568.640625,
      "completions/min_length": 350.0,
      "completions/min_terminated_length": 350.0,
      "epoch": 0.04633333333333333,
      "grad_norm": 0.7145423293113708,
      "kl": 0.0782470703125,
      "learning_rate": 9.200000000000002e-06,
      "loss": -0.0324,
      "num_tokens": 6872709.0,
      "reward": 1.254620909690857,
      "reward_std": 0.4884070521220565,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.26503754034638405,
      "rewards/penalized_accuracy_reward/std": 0.455327644944191,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.06615880131721497,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 139
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 831.75,
      "completions/max_terminated_length": 798.25,
      "completions/mean_length": 577.6875,
      "completions/mean_terminated_length": 572.1739654541016,
      "completions/min_length": 370.25,
      "completions/min_terminated_length": 370.25,
      "epoch": 0.04666666666666667,
      "grad_norm": 0.4422559142112732,
      "kl": 0.0770263671875,
      "learning_rate": 9.266666666666667e-06,
      "loss": 0.0223,
      "num_tokens": 6917809.0,
      "reward": 1.1303823590278625,
      "reward_std": 0.2677050596103072,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.14040839672088623,
      "rewards/penalized_accuracy_reward/std": 0.2511829137802124,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 140
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 825.5,
      "completions/max_terminated_length": 783.0,
      "completions/mean_length": 578.609375,
      "completions/mean_terminated_length": 572.3770904541016,
      "completions/min_length": 362.0,
      "completions/min_terminated_length": 362.0,
      "epoch": 0.047,
      "grad_norm": 0.41671282052993774,
      "kl": 0.0897216796875,
      "learning_rate": 9.333333333333334e-06,
      "loss": -0.0063,
      "num_tokens": 6962376.0,
      "reward": 1.0969283878803253,
      "reward_std": 0.33608537912368774,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.10356900468468666,
      "rewards/penalized_accuracy_reward/std": 0.3315762132406235,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 141
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 709.75,
      "completions/max_terminated_length": 709.75,
      "completions/mean_length": 574.21875,
      "completions/mean_terminated_length": 574.21875,
      "completions/min_length": 432.25,
      "completions/min_terminated_length": 432.25,
      "epoch": 0.04733333333333333,
      "grad_norm": 0.5335647463798523,
      "kl": 0.08154296875,
      "learning_rate": 9.4e-06,
      "loss": 0.0193,
      "num_tokens": 7009670.0,
      "reward": 1.3210109174251556,
      "reward_std": 0.4340841621160507,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.32882341742515564,
      "rewards/penalized_accuracy_reward/std": 0.43917667865753174,
      "rewards/reasoning_steps_reward/mean": 0.984375,
      "rewards/reasoning_steps_reward/std": 0.0625,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 142
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 937.5,
      "completions/max_terminated_length": 927.75,
      "completions/mean_length": 658.375,
      "completions/mean_terminated_length": 642.2320098876953,
      "completions/min_length": 405.25,
      "completions/min_terminated_length": 405.25,
      "epoch": 0.04766666666666667,
      "grad_norm": 0.7152761220932007,
      "kl": 0.08819580078125,
      "learning_rate": 9.466666666666667e-06,
      "loss": 0.0784,
      "num_tokens": 7063806.0,
      "reward": 1.112352579832077,
      "reward_std": 0.4564479161053896,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.21039126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.14047756046056747,
      "rewards/penalized_accuracy_reward/std": 0.3838609904050827,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.96875,
      "rewards/tag_count_reward/std": 0.11091229319572449,
      "step": 143
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 804.5,
      "completions/max_terminated_length": 804.5,
      "completions/mean_length": 592.171875,
      "completions/mean_terminated_length": 592.171875,
      "completions/min_length": 403.75,
      "completions/min_terminated_length": 403.75,
      "epoch": 0.048,
      "grad_norm": 0.42576244473457336,
      "kl": 0.0836181640625,
      "learning_rate": 9.533333333333334e-06,
      "loss": -0.007,
      "num_tokens": 7112569.0,
      "reward": 1.0555336475372314,
      "reward_std": 0.17897546291351318,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.06334614008665085,
      "rewards/penalized_accuracy_reward/std": 0.17320473492145538,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.0625,
      "step": 144
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 916.5,
      "completions/max_terminated_length": 843.25,
      "completions/mean_length": 615.765625,
      "completions/mean_terminated_length": 609.4593811035156,
      "completions/min_length": 397.0,
      "completions/min_terminated_length": 397.0,
      "epoch": 0.04833333333333333,
      "grad_norm": 0.7763967514038086,
      "kl": 0.085205078125,
      "learning_rate": 9.600000000000001e-06,
      "loss": 0.0302,
      "num_tokens": 7162346.0,
      "reward": 1.097764641046524,
      "reward_std": 0.3960863724350929,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.1875,
      "rewards/penalized_accuracy_reward/mean": 0.12289484962821007,
      "rewards/penalized_accuracy_reward/std": 0.3364574760198593,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 145
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 866.25,
      "completions/max_terminated_length": 848.0,
      "completions/mean_length": 651.59375,
      "completions/mean_terminated_length": 629.2216033935547,
      "completions/min_length": 447.75,
      "completions/min_terminated_length": 447.75,
      "epoch": 0.048666666666666664,
      "grad_norm": 0.5924893617630005,
      "kl": 0.0850830078125,
      "learning_rate": 9.666666666666667e-06,
      "loss": 0.063,
      "num_tokens": 7213984.0,
      "reward": 1.452492356300354,
      "reward_std": 0.6358413472771645,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.1875,
      "rewards/penalized_accuracy_reward/mean": 0.500148594379425,
      "rewards/penalized_accuracy_reward/std": 0.5861188173294067,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.9609375,
      "rewards/tag_count_reward/std": 0.06404344737529755,
      "step": 146
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 646.75,
      "completions/max_terminated_length": 626.5,
      "completions/mean_length": 452.921875,
      "completions/mean_terminated_length": 441.7901916503906,
      "completions/min_length": 266.25,
      "completions/min_terminated_length": 266.25,
      "epoch": 0.049,
      "grad_norm": 0.8521579504013062,
      "kl": 0.09228515625,
      "learning_rate": 9.733333333333334e-06,
      "loss": -0.0441,
      "num_tokens": 7251755.0,
      "reward": 1.3368138670921326,
      "reward_std": 0.15262084361165762,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.2596946656703949,
      "rewards/penalized_accuracy_reward/mean": 0.4003555178642273,
      "rewards/penalized_accuracy_reward/std": 0.04466142877936363,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.0833333320915699,
      "rewards/tag_count_reward/mean": 0.90625,
      "rewards/tag_count_reward/std": 0.21579129993915558,
      "step": 147
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 695.0,
      "completions/max_terminated_length": 695.0,
      "completions/mean_length": 484.296875,
      "completions/mean_terminated_length": 484.296875,
      "completions/min_length": 300.0,
      "completions/min_terminated_length": 300.0,
      "epoch": 0.04933333333333333,
      "grad_norm": 0.8887056708335876,
      "kl": 0.096435546875,
      "learning_rate": 9.800000000000001e-06,
      "loss": -0.0271,
      "num_tokens": 7293534.0,
      "reward": 1.0694793164730072,
      "reward_std": 0.40229372307658195,
      "rewards/format_reward/mean": 0.796875,
      "rewards/format_reward/std": 0.4022643193602562,
      "rewards/penalized_accuracy_reward/mean": 0.16635430604219437,
      "rewards/penalized_accuracy_reward/std": 0.3324264883995056,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.84375,
      "rewards/tag_count_reward/std": 0.3176925200968981,
      "step": 148
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 859.5,
      "completions/max_terminated_length": 857.0,
      "completions/mean_length": 633.71875,
      "completions/mean_terminated_length": 630.0052185058594,
      "completions/min_length": 394.5,
      "completions/min_terminated_length": 394.5,
      "epoch": 0.049666666666666665,
      "grad_norm": 0.6912094354629517,
      "kl": 0.0787353515625,
      "learning_rate": 9.866666666666668e-06,
      "loss": 0.0167,
      "num_tokens": 7343500.0,
      "reward": 1.2271893173456192,
      "reward_std": 0.37537867948412895,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.24463719688355923,
      "rewards/penalized_accuracy_reward/std": 0.33784525841474533,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.06454972177743912,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 149
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 789.25,
      "completions/max_terminated_length": 743.25,
      "completions/mean_length": 593.515625,
      "completions/mean_terminated_length": 568.71875,
      "completions/min_length": 395.0,
      "completions/min_terminated_length": 395.0,
      "epoch": 0.05,
      "grad_norm": 0.7075642347335815,
      "kl": 0.091064453125,
      "learning_rate": 9.933333333333334e-06,
      "loss": 0.0403,
      "num_tokens": 7394333.0,
      "reward": 0.9887036979198456,
      "reward_std": 0.21000467520207167,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.17430340498685837,
      "rewards/penalized_accuracy_reward/mean": 0.028807848691940308,
      "rewards/penalized_accuracy_reward/std": 0.11523139476776123,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.9375,
      "rewards/tag_count_reward/std": 0.14635255187749863,
      "step": 150
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 751.0,
      "completions/max_terminated_length": 707.0,
      "completions/mean_length": 535.09375,
      "completions/mean_terminated_length": 528.0125122070312,
      "completions/min_length": 350.0,
      "completions/min_terminated_length": 350.0,
      "epoch": 0.050333333333333334,
      "grad_norm": 0.4760342538356781,
      "kl": 0.0927734375,
      "learning_rate": 1e-05,
      "loss": -0.0331,
      "num_tokens": 7438003.0,
      "reward": 0.9880945086479187,
      "reward_std": 0.20418935269117355,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.1971946656703949,
      "rewards/penalized_accuracy_reward/mean": 0.03457889333367348,
      "rewards/penalized_accuracy_reward/std": 0.1383155733346939,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.91015625,
      "rewards/tag_count_reward/std": 0.18738707154989243,
      "step": 151
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 794.5,
      "completions/max_terminated_length": 794.5,
      "completions/mean_length": 529.515625,
      "completions/mean_terminated_length": 529.515625,
      "completions/min_length": 351.5,
      "completions/min_terminated_length": 351.5,
      "epoch": 0.050666666666666665,
      "grad_norm": 0.6448789834976196,
      "kl": 0.101806640625,
      "learning_rate": 1.0066666666666666e-05,
      "loss": -0.0228,
      "num_tokens": 7481876.0,
      "reward": 0.9598958492279053,
      "reward_std": 0.10131500661373138,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.1875,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.9635416716337204,
      "rewards/reasoning_steps_reward/std": 0.11148427054286003,
      "rewards/tag_count_reward/mean": 0.96875,
      "rewards/tag_count_reward/std": 0.125,
      "step": 152
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 768.0,
      "completions/max_terminated_length": 768.0,
      "completions/mean_length": 542.40625,
      "completions/mean_terminated_length": 542.40625,
      "completions/min_length": 291.25,
      "completions/min_terminated_length": 291.25,
      "epoch": 0.051,
      "grad_norm": 0.6065596342086792,
      "kl": 0.10888671875,
      "learning_rate": 1.0133333333333335e-05,
      "loss": 0.0136,
      "num_tokens": 7529262.0,
      "reward": 1.0723292827606201,
      "reward_std": 0.26749104261398315,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.10357928276062012,
      "rewards/penalized_accuracy_reward/std": 0.2227180302143097,
      "rewards/reasoning_steps_reward/mean": 0.96875,
      "rewards/reasoning_steps_reward/std": 0.07375510036945343,
      "rewards/tag_count_reward/mean": 0.96875,
      "rewards/tag_count_reward/std": 0.125,
      "step": 153
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 819.5,
      "completions/max_terminated_length": 819.5,
      "completions/mean_length": 581.640625,
      "completions/mean_terminated_length": 581.640625,
      "completions/min_length": 388.25,
      "completions/min_terminated_length": 388.25,
      "epoch": 0.051333333333333335,
      "grad_norm": 0.7174891829490662,
      "kl": 0.0872802734375,
      "learning_rate": 1.02e-05,
      "loss": 0.0025,
      "num_tokens": 7576631.0,
      "reward": 1.0558475106954575,
      "reward_std": 0.27014252822846174,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.07329539954662323,
      "rewards/penalized_accuracy_reward/std": 0.20035086572170258,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.9765625,
      "rewards/tag_count_reward/std": 0.09375,
      "step": 154
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 809.25,
      "completions/max_terminated_length": 809.25,
      "completions/mean_length": 554.03125,
      "completions/mean_terminated_length": 554.03125,
      "completions/min_length": 333.75,
      "completions/min_terminated_length": 333.75,
      "epoch": 0.051666666666666666,
      "grad_norm": 0.5973610281944275,
      "kl": 0.0762939453125,
      "learning_rate": 1.0266666666666668e-05,
      "loss": 0.0161,
      "num_tokens": 7621753.0,
      "reward": 0.962109386920929,
      "reward_std": 0.04935498908162117,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9375000149011612,
      "rewards/reasoning_steps_reward/std": 0.08836335688829422,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 155
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 851.0,
      "completions/max_terminated_length": 851.0,
      "completions/mean_length": 557.0625,
      "completions/mean_terminated_length": 557.0625,
      "completions/min_length": 320.0,
      "completions/min_terminated_length": 320.0,
      "epoch": 0.052,
      "grad_norm": 0.5781278014183044,
      "kl": 0.09228515625,
      "learning_rate": 1.0333333333333335e-05,
      "loss": -0.0361,
      "num_tokens": 7667229.0,
      "reward": 0.9921875298023224,
      "reward_std": 0.03124998975545168,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.0624999962747097,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 156
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 715.0,
      "completions/max_terminated_length": 715.0,
      "completions/mean_length": 535.0,
      "completions/mean_terminated_length": 535.0,
      "completions/min_length": 312.5,
      "completions/min_terminated_length": 312.5,
      "epoch": 0.052333333333333336,
      "grad_norm": 0.08791633695363998,
      "kl": 0.1031494140625,
      "learning_rate": 1.04e-05,
      "loss": 0.0041,
      "num_tokens": 7711901.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 157
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 888.75,
      "completions/max_terminated_length": 888.75,
      "completions/mean_length": 569.859375,
      "completions/mean_terminated_length": 569.859375,
      "completions/min_length": 321.5,
      "completions/min_terminated_length": 321.5,
      "epoch": 0.05266666666666667,
      "grad_norm": 0.2468576729297638,
      "kl": 0.1019287109375,
      "learning_rate": 1.0466666666666668e-05,
      "loss": -0.0474,
      "num_tokens": 7759348.0,
      "reward": 0.9973958432674408,
      "reward_std": 0.010416663251817226,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 158
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 850.75,
      "completions/max_terminated_length": 827.0,
      "completions/mean_length": 564.75,
      "completions/mean_terminated_length": 560.0031280517578,
      "completions/min_length": 386.25,
      "completions/min_terminated_length": 386.25,
      "epoch": 0.053,
      "grad_norm": 0.7196468710899353,
      "kl": 0.0997314453125,
      "learning_rate": 1.0533333333333333e-05,
      "loss": 0.0337,
      "num_tokens": 7804964.0,
      "reward": 1.1889299154281616,
      "reward_std": 0.41827017441391945,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.21640385873615742,
      "rewards/penalized_accuracy_reward/std": 0.3672218695282936,
      "rewards/reasoning_steps_reward/mean": 0.9583333432674408,
      "rewards/reasoning_steps_reward/std": 0.10400499776005745,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 159
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 912.25,
      "completions/max_terminated_length": 889.25,
      "completions/mean_length": 659.796875,
      "completions/mean_terminated_length": 649.8364715576172,
      "completions/min_length": 370.75,
      "completions/min_terminated_length": 370.75,
      "epoch": 0.05333333333333334,
      "grad_norm": 0.7643465995788574,
      "kl": 0.0953369140625,
      "learning_rate": 1.0600000000000002e-05,
      "loss": 0.0674,
      "num_tokens": 7857383.0,
      "reward": 1.0303755104541779,
      "reward_std": 0.2476533642038703,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.1875,
      "rewards/penalized_accuracy_reward/mean": 0.06331821531057358,
      "rewards/penalized_accuracy_reward/std": 0.17318597435951233,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.08538305386900902,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 160
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 922.5,
      "completions/max_terminated_length": 915.25,
      "completions/mean_length": 688.265625,
      "completions/mean_terminated_length": 680.2254486083984,
      "completions/min_length": 373.75,
      "completions/min_terminated_length": 373.75,
      "epoch": 0.05366666666666667,
      "grad_norm": 0.6083821654319763,
      "kl": 0.08447265625,
      "learning_rate": 1.0666666666666667e-05,
      "loss": 0.0299,
      "num_tokens": 7913800.0,
      "reward": 1.0095978379249573,
      "reward_std": 0.12780765816569328,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.08539126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.022879095748066902,
      "rewards/penalized_accuracy_reward/std": 0.09151638299226761,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.021347815170884132,
      "step": 161
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 934.0,
      "completions/max_terminated_length": 927.75,
      "completions/mean_length": 665.5625,
      "completions/mean_terminated_length": 660.6781311035156,
      "completions/min_length": 413.5,
      "completions/min_terminated_length": 413.5,
      "epoch": 0.054,
      "grad_norm": 0.498530775308609,
      "kl": 0.084228515625,
      "learning_rate": 1.0733333333333333e-05,
      "loss": 0.0739,
      "num_tokens": 7967820.0,
      "reward": 1.0622639656066895,
      "reward_std": 0.3131758403033018,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0734618678689003,
      "rewards/penalized_accuracy_reward/std": 0.2938474714756012,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.9765625,
      "rewards/tag_count_reward/std": 0.06404344737529755,
      "step": 162
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 778.5,
      "completions/max_terminated_length": 777.75,
      "completions/mean_length": 593.96875,
      "completions/mean_terminated_length": 578.9427185058594,
      "completions/min_length": 289.5,
      "completions/min_terminated_length": 289.5,
      "epoch": 0.05433333333333333,
      "grad_norm": 0.9088695645332336,
      "kl": 0.102783203125,
      "learning_rate": 1.0800000000000002e-05,
      "loss": -0.0051,
      "num_tokens": 8018394.0,
      "reward": 1.031543791294098,
      "reward_std": 0.33813750743865967,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.11180340498685837,
      "rewards/penalized_accuracy_reward/mean": 0.09391359053552151,
      "rewards/penalized_accuracy_reward/std": 0.27939801663160324,
      "rewards/reasoning_steps_reward/mean": 0.9322916865348816,
      "rewards/reasoning_steps_reward/std": 0.12620654702186584,
      "rewards/tag_count_reward/mean": 0.96484375,
      "rewards/tag_count_reward/std": 0.08017472177743912,
      "step": 163
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 846.0,
      "completions/max_terminated_length": 846.0,
      "completions/mean_length": 629.40625,
      "completions/mean_terminated_length": 629.40625,
      "completions/min_length": 357.25,
      "completions/min_terminated_length": 357.25,
      "epoch": 0.05466666666666667,
      "grad_norm": 0.683199942111969,
      "kl": 0.095947265625,
      "learning_rate": 1.0866666666666667e-05,
      "loss": -0.0333,
      "num_tokens": 8068452.0,
      "reward": 0.9789062589406967,
      "reward_std": 0.08437499310821295,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.0624999962747097,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 164
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 783.75,
      "completions/max_terminated_length": 783.75,
      "completions/mean_length": 538.671875,
      "completions/mean_terminated_length": 538.671875,
      "completions/min_length": 324.75,
      "completions/min_terminated_length": 324.75,
      "epoch": 0.055,
      "grad_norm": 0.7674322128295898,
      "kl": 0.096435546875,
      "learning_rate": 1.0933333333333334e-05,
      "loss": 0.0147,
      "num_tokens": 8111759.0,
      "reward": 1.2586846053600311,
      "reward_std": 0.4014707673341036,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.08539126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.31063770316541195,
      "rewards/penalized_accuracy_reward/std": 0.3363153263926506,
      "rewards/reasoning_steps_reward/mean": 0.9218750149011612,
      "rewards/reasoning_steps_reward/std": 0.14865445718169212,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 165
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 911.25,
      "completions/max_terminated_length": 872.5,
      "completions/mean_length": 625.640625,
      "completions/mean_terminated_length": 619.5031280517578,
      "completions/min_length": 316.75,
      "completions/min_terminated_length": 316.75,
      "epoch": 0.05533333333333333,
      "grad_norm": 0.6314289569854736,
      "kl": 0.1151123046875,
      "learning_rate": 1.1000000000000001e-05,
      "loss": -0.0063,
      "num_tokens": 8161384.0,
      "reward": 0.9713541865348816,
      "reward_std": 0.06449455861002207,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.08539126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.08699213340878487,
      "rewards/tag_count_reward/mean": 0.96875,
      "rewards/tag_count_reward/std": 0.11027991026639938,
      "step": 166
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 760.25,
      "completions/max_terminated_length": 760.25,
      "completions/mean_length": 594.03125,
      "completions/mean_terminated_length": 594.03125,
      "completions/min_length": 415.75,
      "completions/min_terminated_length": 415.75,
      "epoch": 0.05566666666666667,
      "grad_norm": 0.7111170291900635,
      "kl": 0.1248779296875,
      "learning_rate": 1.1066666666666669e-05,
      "loss": -0.0209,
      "num_tokens": 8207802.0,
      "reward": 1.0302852392196655,
      "reward_std": 0.16280756704509258,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.035493556410074234,
      "rewards/penalized_accuracy_reward/std": 0.14197422564029694,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 167
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 898.5,
      "completions/max_terminated_length": 819.5,
      "completions/mean_length": 615.109375,
      "completions/mean_terminated_length": 602.5218963623047,
      "completions/min_length": 367.75,
      "completions/min_terminated_length": 367.75,
      "epoch": 0.056,
      "grad_norm": 0.48290061950683594,
      "kl": 0.0958251953125,
      "learning_rate": 1.1133333333333334e-05,
      "loss": 0.0452,
      "num_tokens": 8259409.0,
      "reward": 1.0043448507785797,
      "reward_std": 0.1297731138765812,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.023615676909685135,
      "rewards/penalized_accuracy_reward/std": 0.09446270018815994,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.028463751077651978,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.0625,
      "step": 168
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 981.25,
      "completions/max_terminated_length": 952.0,
      "completions/mean_length": 640.203125,
      "completions/mean_terminated_length": 626.9486694335938,
      "completions/min_length": 365.5,
      "completions/min_terminated_length": 365.5,
      "epoch": 0.05633333333333333,
      "grad_norm": 0.6626110076904297,
      "kl": 0.091552734375,
      "learning_rate": 1.1200000000000001e-05,
      "loss": 0.0242,
      "num_tokens": 8312334.0,
      "reward": 0.977734386920929,
      "reward_std": 0.06429007556289434,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.08539126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.04929708316922188,
      "rewards/tag_count_reward/mean": 0.98046875,
      "rewards/tag_count_reward/std": 0.06340491026639938,
      "step": 169
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 884.0,
      "completions/max_terminated_length": 873.75,
      "completions/mean_length": 599.890625,
      "completions/mean_terminated_length": 593.7854309082031,
      "completions/min_length": 362.75,
      "completions/min_terminated_length": 362.75,
      "epoch": 0.056666666666666664,
      "grad_norm": 0.8113217353820801,
      "kl": 0.1051025390625,
      "learning_rate": 1.1266666666666668e-05,
      "loss": 0.0321,
      "num_tokens": 8358455.0,
      "reward": 1.091463789343834,
      "reward_std": 0.2897885050624609,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.13078668527305126,
      "rewards/penalized_accuracy_reward/std": 0.23328303545713425,
      "rewards/reasoning_steps_reward/mean": 0.9479166865348816,
      "rewards/reasoning_steps_reward/std": 0.11515219509601593,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 170
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 816.0,
      "completions/max_terminated_length": 781.25,
      "completions/mean_length": 544.46875,
      "completions/mean_terminated_length": 538.4937591552734,
      "completions/min_length": 250.0,
      "completions/min_terminated_length": 250.0,
      "epoch": 0.057,
      "grad_norm": 0.773130476474762,
      "kl": 0.1029052734375,
      "learning_rate": 1.1333333333333334e-05,
      "loss": 0.0359,
      "num_tokens": 8403141.0,
      "reward": 1.2276104539632797,
      "reward_std": 0.38097991049289703,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.2628968879580498,
      "rewards/penalized_accuracy_reward/std": 0.33269084990024567,
      "rewards/reasoning_steps_reward/mean": 0.942708358168602,
      "rewards/reasoning_steps_reward/std": 0.15272368490695953,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 171
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 769.25,
      "completions/max_terminated_length": 769.25,
      "completions/mean_length": 547.65625,
      "completions/mean_terminated_length": 547.65625,
      "completions/min_length": 356.75,
      "completions/min_terminated_length": 356.75,
      "epoch": 0.05733333333333333,
      "grad_norm": 0.6168924570083618,
      "kl": 0.1142578125,
      "learning_rate": 1.14e-05,
      "loss": -0.0371,
      "num_tokens": 8448191.0,
      "reward": 0.9587239623069763,
      "reward_std": 0.09781504608690739,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9322916716337204,
      "rewards/reasoning_steps_reward/std": 0.13625510036945343,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 172
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 756.25,
      "completions/max_terminated_length": 756.25,
      "completions/mean_length": 538.265625,
      "completions/mean_terminated_length": 538.265625,
      "completions/min_length": 314.75,
      "completions/min_terminated_length": 314.75,
      "epoch": 0.057666666666666665,
      "grad_norm": 0.28591179847717285,
      "kl": 0.106689453125,
      "learning_rate": 1.1466666666666668e-05,
      "loss": -0.0214,
      "num_tokens": 8492928.0,
      "reward": 0.984375,
      "reward_std": 0.029949801042675972,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.96875,
      "rewards/reasoning_steps_reward/std": 0.05989960581064224,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 173
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 863.0,
      "completions/max_terminated_length": 863.0,
      "completions/mean_length": 600.078125,
      "completions/mean_terminated_length": 600.078125,
      "completions/min_length": 428.5,
      "completions/min_terminated_length": 428.5,
      "epoch": 0.058,
      "grad_norm": 0.6120237112045288,
      "kl": 0.1251220703125,
      "learning_rate": 1.1533333333333334e-05,
      "loss": 0.0012,
      "num_tokens": 8542149.0,
      "reward": 1.0353666841983795,
      "reward_std": 0.1445917427772656,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.035757310688495636,
      "rewards/penalized_accuracy_reward/std": 0.14302925765514374,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 174
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 778.0,
      "completions/max_terminated_length": 778.0,
      "completions/mean_length": 546.40625,
      "completions/mean_terminated_length": 546.40625,
      "completions/min_length": 326.5,
      "completions/min_terminated_length": 326.5,
      "epoch": 0.058333333333333334,
      "grad_norm": 0.8150026798248291,
      "kl": 0.107177734375,
      "learning_rate": 1.16e-05,
      "loss": 0.0531,
      "num_tokens": 8586303.0,
      "reward": 0.9785156399011612,
      "reward_std": 0.061021566041745245,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.984375,
      "rewards/reasoning_steps_reward/std": 0.045325469225645065,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 175
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 939.5,
      "completions/max_terminated_length": 900.25,
      "completions/mean_length": 603.34375,
      "completions/mean_terminated_length": 596.8812561035156,
      "completions/min_length": 323.75,
      "completions/min_terminated_length": 323.75,
      "epoch": 0.058666666666666666,
      "grad_norm": 0.5437741875648499,
      "kl": 0.104736328125,
      "learning_rate": 1.1666666666666668e-05,
      "loss": 0.0275,
      "num_tokens": 8636565.0,
      "reward": 1.141858160495758,
      "reward_std": 0.30250774696469307,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.15188416838645935,
      "rewards/penalized_accuracy_reward/std": 0.27170804142951965,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 176
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 859.75,
      "completions/max_terminated_length": 859.75,
      "completions/mean_length": 575.453125,
      "completions/mean_terminated_length": 575.453125,
      "completions/min_length": 334.25,
      "completions/min_terminated_length": 334.25,
      "epoch": 0.059,
      "grad_norm": 0.7538095116615295,
      "kl": 0.107421875,
      "learning_rate": 1.1733333333333335e-05,
      "loss": -0.0034,
      "num_tokens": 8680978.0,
      "reward": 1.022205427289009,
      "reward_std": 0.15210942446719855,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.031189795583486557,
      "rewards/penalized_accuracy_reward/std": 0.12475918233394623,
      "rewards/reasoning_steps_reward/mean": 0.984375,
      "rewards/reasoning_steps_reward/std": 0.045325469225645065,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 177
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 791.0,
      "completions/max_terminated_length": 791.0,
      "completions/mean_length": 577.6875,
      "completions/mean_terminated_length": 577.6875,
      "completions/min_length": 342.5,
      "completions/min_terminated_length": 342.5,
      "epoch": 0.059333333333333335,
      "grad_norm": 0.43398621678352356,
      "kl": 0.126708984375,
      "learning_rate": 1.18e-05,
      "loss": -0.0072,
      "num_tokens": 8726478.0,
      "reward": 0.9947916716337204,
      "reward_std": 0.01423187181353569,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.028463751077651978,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 178
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 982.0,
      "completions/max_terminated_length": 876.5,
      "completions/mean_length": 672.703125,
      "completions/mean_terminated_length": 656.7687835693359,
      "completions/min_length": 471.25,
      "completions/min_terminated_length": 471.25,
      "epoch": 0.059666666666666666,
      "grad_norm": 0.6989478468894958,
      "kl": 0.08056640625,
      "learning_rate": 1.186666666666667e-05,
      "loss": 0.0736,
      "num_tokens": 8781163.0,
      "reward": 1.0119648873806,
      "reward_std": 0.20964388456195593,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.1875,
      "rewards/penalized_accuracy_reward/mean": 0.03527218475937843,
      "rewards/penalized_accuracy_reward/std": 0.14108875393867493,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.98046875,
      "rewards/tag_count_reward/std": 0.078125,
      "step": 179
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 947.75,
      "completions/max_terminated_length": 932.25,
      "completions/mean_length": 661.421875,
      "completions/mean_terminated_length": 640.4619293212891,
      "completions/min_length": 336.0,
      "completions/min_terminated_length": 336.0,
      "epoch": 0.06,
      "grad_norm": 0.6695852279663086,
      "kl": 0.0841064453125,
      "learning_rate": 1.1933333333333335e-05,
      "loss": 0.0695,
      "num_tokens": 8836006.0,
      "reward": 0.963020846247673,
      "reward_std": 0.10263045411556959,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.21039126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9791666865348816,
      "rewards/reasoning_steps_reward/std": 0.07013041526079178,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.05259781517088413,
      "step": 180
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 914.75,
      "completions/max_terminated_length": 914.75,
      "completions/mean_length": 618.546875,
      "completions/mean_terminated_length": 618.546875,
      "completions/min_length": 387.5,
      "completions/min_terminated_length": 387.5,
      "epoch": 0.060333333333333336,
      "grad_norm": 0.6475189924240112,
      "kl": 0.1094970703125,
      "learning_rate": 1.2e-05,
      "loss": -0.0183,
      "num_tokens": 8887369.0,
      "reward": 1.093987837433815,
      "reward_std": 0.3499455749988556,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.10440447181463242,
      "rewards/penalized_accuracy_reward/std": 0.3267183154821396,
      "rewards/reasoning_steps_reward/mean": 0.9791666865348816,
      "rewards/reasoning_steps_reward/std": 0.0833333283662796,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 181
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 635.25,
      "completions/max_terminated_length": 635.25,
      "completions/mean_length": 464.8125,
      "completions/mean_terminated_length": 464.8125,
      "completions/min_length": 333.5,
      "completions/min_terminated_length": 333.5,
      "epoch": 0.06066666666666667,
      "grad_norm": 0.7688916325569153,
      "kl": 0.1220703125,
      "learning_rate": 1.206666666666667e-05,
      "loss": -0.0231,
      "num_tokens": 8927805.0,
      "reward": 0.9999911040067673,
      "reward_std": 0.1609197175130248,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.023428570479154587,
      "rewards/penalized_accuracy_reward/std": 0.09371428191661835,
      "rewards/reasoning_steps_reward/mean": 0.9531250149011612,
      "rewards/reasoning_steps_reward/std": 0.17032546550035477,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 182
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 737.5,
      "completions/max_terminated_length": 737.5,
      "completions/mean_length": 523.21875,
      "completions/mean_terminated_length": 523.21875,
      "completions/min_length": 291.5,
      "completions/min_terminated_length": 291.5,
      "epoch": 0.061,
      "grad_norm": 0.7014211416244507,
      "kl": 0.1153564453125,
      "learning_rate": 1.2133333333333335e-05,
      "loss": -0.0261,
      "num_tokens": 8970027.0,
      "reward": 1.2516676783561707,
      "reward_std": 0.3363241720944643,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.2725009620189667,
      "rewards/penalized_accuracy_reward/std": 0.2822096049785614,
      "rewards/reasoning_steps_reward/mean": 0.9583333432674408,
      "rewards/reasoning_steps_reward/std": 0.13775940239429474,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 183
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 844.5,
      "completions/max_terminated_length": 844.5,
      "completions/mean_length": 555.703125,
      "completions/mean_terminated_length": 555.703125,
      "completions/min_length": 349.75,
      "completions/min_terminated_length": 349.75,
      "epoch": 0.06133333333333333,
      "grad_norm": 0.5648781061172485,
      "kl": 0.08984375,
      "learning_rate": 1.22e-05,
      "loss": 0.0239,
      "num_tokens": 9017016.0,
      "reward": 1.2053382843732834,
      "reward_std": 0.3726501800119877,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.22760391235351562,
      "rewards/penalized_accuracy_reward/std": 0.34619054943323135,
      "rewards/reasoning_steps_reward/mean": 0.96875,
      "rewards/reasoning_steps_reward/std": 0.09065093845129013,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 184
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 825.75,
      "completions/max_terminated_length": 814.0,
      "completions/mean_length": 582.90625,
      "completions/mean_terminated_length": 579.5052185058594,
      "completions/min_length": 347.25,
      "completions/min_terminated_length": 347.25,
      "epoch": 0.06166666666666667,
      "grad_norm": 0.7277988195419312,
      "kl": 0.1080322265625,
      "learning_rate": 1.2266666666666667e-05,
      "loss": -0.0195,
      "num_tokens": 9062834.0,
      "reward": 1.0418638736009598,
      "reward_std": 0.2697664760053158,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.0689472034573555,
      "rewards/penalized_accuracy_reward/std": 0.18844179809093475,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.08699213340878487,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.0625,
      "step": 185
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 900.75,
      "completions/max_terminated_length": 900.75,
      "completions/mean_length": 635.71875,
      "completions/mean_terminated_length": 635.71875,
      "completions/min_length": 447.5,
      "completions/min_terminated_length": 447.5,
      "epoch": 0.062,
      "grad_norm": 0.35816872119903564,
      "kl": 0.1043701171875,
      "learning_rate": 1.2333333333333334e-05,
      "loss": 0.014,
      "num_tokens": 9112192.0,
      "reward": 0.9973958432674408,
      "reward_std": 0.010416663251817226,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 186
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 969.0,
      "completions/max_terminated_length": 957.75,
      "completions/mean_length": 775.0625,
      "completions/mean_terminated_length": 769.0535736083984,
      "completions/min_length": 553.75,
      "completions/min_terminated_length": 553.75,
      "epoch": 0.06233333333333333,
      "grad_norm": 0.48311498761177063,
      "kl": 0.1060791015625,
      "learning_rate": 1.2400000000000002e-05,
      "loss": 0.0115,
      "num_tokens": 9172964.0,
      "reward": 0.9718749970197678,
      "reward_std": 0.07948593609035015,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.14789126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.04929708316922188,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.049619100987911224,
      "step": 187
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 898.0,
      "completions/max_terminated_length": 857.0,
      "completions/mean_length": 708.859375,
      "completions/mean_terminated_length": 704.3979187011719,
      "completions/min_length": 497.25,
      "completions/min_terminated_length": 497.25,
      "epoch": 0.06266666666666666,
      "grad_norm": 0.5689432621002197,
      "kl": 0.1102294921875,
      "learning_rate": 1.2466666666666667e-05,
      "loss": 0.0226,
      "num_tokens": 9228219.0,
      "reward": 1.0231836587190628,
      "reward_std": 0.16981789749115705,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.032819055020809174,
      "rewards/penalized_accuracy_reward/std": 0.1312762200832367,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 188
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 1018.25,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 846.1875,
      "completions/mean_terminated_length": 795.1122283935547,
      "completions/min_length": 552.0,
      "completions/min_terminated_length": 552.0,
      "epoch": 0.063,
      "grad_norm": 0.5394105911254883,
      "kl": 0.095703125,
      "learning_rate": 1.2533333333333336e-05,
      "loss": 0.0808,
      "num_tokens": 9292071.0,
      "reward": 0.9207031279802322,
      "reward_std": 0.12802047468721867,
      "rewards/format_reward/mean": 0.828125,
      "rewards/format_reward/std": 0.27699070423841476,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.89453125,
      "rewards/tag_count_reward/std": 0.18513670563697815,
      "step": 189
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 981.5,
      "completions/max_terminated_length": 957.5,
      "completions/mean_length": 824.4375,
      "completions/mean_terminated_length": 787.9070129394531,
      "completions/min_length": 592.0,
      "completions/min_terminated_length": 592.0,
      "epoch": 0.06333333333333334,
      "grad_norm": 0.6665021777153015,
      "kl": 0.0994873046875,
      "learning_rate": 1.2600000000000001e-05,
      "loss": 0.0531,
      "num_tokens": 9355107.0,
      "reward": 0.931629091501236,
      "reward_std": 0.2687137797474861,
      "rewards/format_reward/mean": 0.78125,
      "rewards/format_reward/std": 0.2675696536898613,
      "rewards/penalized_accuracy_reward/mean": 0.0352749302983284,
      "rewards/penalized_accuracy_reward/std": 0.1410997211933136,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.890625,
      "rewards/tag_count_reward/std": 0.15779344737529755,
      "step": 190
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 899.25,
      "completions/max_terminated_length": 899.25,
      "completions/mean_length": 663.390625,
      "completions/mean_terminated_length": 663.390625,
      "completions/min_length": 487.75,
      "completions/min_terminated_length": 487.75,
      "epoch": 0.06366666666666666,
      "grad_norm": 0.3484903872013092,
      "kl": 0.0921630859375,
      "learning_rate": 1.2666666666666667e-05,
      "loss": -0.004,
      "num_tokens": 9412332.0,
      "reward": 1.1529858112335205,
      "reward_std": 0.27366939187049866,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.15298579633235931,
      "rewards/penalized_accuracy_reward/std": 0.27366936206817627,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 191
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 987.75,
      "completions/max_terminated_length": 967.5,
      "completions/mean_length": 771.890625,
      "completions/mean_terminated_length": 761.0098266601562,
      "completions/min_length": 558.0,
      "completions/min_terminated_length": 558.0,
      "epoch": 0.064,
      "grad_norm": 0.47288382053375244,
      "kl": 0.1241455078125,
      "learning_rate": 1.2733333333333336e-05,
      "loss": 0.0468,
      "num_tokens": 9473109.0,
      "reward": 0.9766927063465118,
      "reward_std": 0.0663717407733202,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.14789126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.98046875,
      "rewards/tag_count_reward/std": 0.06403729319572449,
      "step": 192
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 884.75,
      "completions/max_terminated_length": 884.75,
      "completions/mean_length": 631.03125,
      "completions/mean_terminated_length": 631.03125,
      "completions/min_length": 373.0,
      "completions/min_terminated_length": 373.0,
      "epoch": 0.06433333333333334,
      "grad_norm": 0.6051411032676697,
      "kl": 0.0982666015625,
      "learning_rate": 1.2800000000000001e-05,
      "loss": -0.0479,
      "num_tokens": 9522567.0,
      "reward": 1.0272425264120102,
      "reward_std": 0.23006188031286,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.053284166380763054,
      "rewards/penalized_accuracy_reward/std": 0.21313666552305222,
      "rewards/reasoning_steps_reward/mean": 0.9479166865348816,
      "rewards/reasoning_steps_reward/std": 0.14323293417692184,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 193
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 893.25,
      "completions/max_terminated_length": 893.25,
      "completions/mean_length": 694.5,
      "completions/mean_terminated_length": 694.5,
      "completions/min_length": 479.5,
      "completions/min_terminated_length": 479.5,
      "epoch": 0.06466666666666666,
      "grad_norm": 0.6984221339225769,
      "kl": 0.107421875,
      "learning_rate": 1.2866666666666667e-05,
      "loss": -0.0016,
      "num_tokens": 9580599.0,
      "reward": 1.1104334741830826,
      "reward_std": 0.30629251059144735,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.1234542764723301,
      "rewards/penalized_accuracy_reward/std": 0.2720722556114197,
      "rewards/reasoning_steps_reward/mean": 0.973958358168602,
      "rewards/reasoning_steps_reward/std": 0.1041666604578495,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 194
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 800.25,
      "completions/max_terminated_length": 800.25,
      "completions/mean_length": 615.984375,
      "completions/mean_terminated_length": 615.984375,
      "completions/min_length": 402.0,
      "completions/min_terminated_length": 402.0,
      "epoch": 0.065,
      "grad_norm": 0.4620238244533539,
      "kl": 0.1046142578125,
      "learning_rate": 1.2933333333333334e-05,
      "loss": -0.0052,
      "num_tokens": 9628998.0,
      "reward": 0.9947916865348816,
      "reward_std": 0.020833326503634453,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 195
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 886.0,
      "completions/max_terminated_length": 833.75,
      "completions/mean_length": 615.359375,
      "completions/mean_terminated_length": 605.1760559082031,
      "completions/min_length": 430.25,
      "completions/min_terminated_length": 430.25,
      "epoch": 0.06533333333333333,
      "grad_norm": 0.5173254013061523,
      "kl": 0.116943359375,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 0.0173,
      "num_tokens": 9678045.0,
      "reward": 0.9632812589406967,
      "reward_std": 0.08124083653092384,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9531250149011612,
      "rewards/reasoning_steps_reward/std": 0.1219017468392849,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 196
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 746.75,
      "completions/max_terminated_length": 746.75,
      "completions/mean_length": 528.8125,
      "completions/mean_terminated_length": 528.8125,
      "completions/min_length": 342.25,
      "completions/min_terminated_length": 342.25,
      "epoch": 0.06566666666666666,
      "grad_norm": 0.796053946018219,
      "kl": 0.1429443359375,
      "learning_rate": 1.3066666666666668e-05,
      "loss": -0.0489,
      "num_tokens": 9726577.0,
      "reward": 1.0946148484945297,
      "reward_std": 0.3945549316704273,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.11427633091807365,
      "rewards/penalized_accuracy_reward/std": 0.3603511452674866,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.0727677047252655,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 197
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 629.25,
      "completions/max_terminated_length": 629.25,
      "completions/mean_length": 489.96875,
      "completions/mean_terminated_length": 489.96875,
      "completions/min_length": 350.5,
      "completions/min_terminated_length": 350.5,
      "epoch": 0.066,
      "grad_norm": 0.688350260257721,
      "kl": 0.1165771484375,
      "learning_rate": 1.3133333333333334e-05,
      "loss": -0.0299,
      "num_tokens": 9768255.0,
      "reward": 1.218723639845848,
      "reward_std": 0.22623951733112335,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.24216115474700928,
      "rewards/penalized_accuracy_reward/std": 0.19652985036373138,
      "rewards/reasoning_steps_reward/mean": 0.9531250149011612,
      "rewards/reasoning_steps_reward/std": 0.11016901582479477,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 198
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 645.5,
      "completions/max_terminated_length": 645.5,
      "completions/mean_length": 484.90625,
      "completions/mean_terminated_length": 484.90625,
      "completions/min_length": 348.25,
      "completions/min_terminated_length": 348.25,
      "epoch": 0.06633333333333333,
      "grad_norm": 0.7640448212623596,
      "kl": 0.106689453125,
      "learning_rate": 1.3200000000000002e-05,
      "loss": -0.034,
      "num_tokens": 9808185.0,
      "reward": 1.189970687031746,
      "reward_std": 0.30377752613276243,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.22903314232826233,
      "rewards/penalized_accuracy_reward/std": 0.23654542863368988,
      "rewards/reasoning_steps_reward/mean": 0.9218750149011612,
      "rewards/reasoning_steps_reward/std": 0.20916644483804703,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 199
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 732.0,
      "completions/max_terminated_length": 732.0,
      "completions/mean_length": 534.609375,
      "completions/mean_terminated_length": 534.609375,
      "completions/min_length": 399.5,
      "completions/min_terminated_length": 399.5,
      "epoch": 0.06666666666666667,
      "grad_norm": 0.7903880476951599,
      "kl": 0.1226806640625,
      "learning_rate": 1.3266666666666668e-05,
      "loss": 0.019,
      "num_tokens": 9852080.0,
      "reward": 1.1319045722484589,
      "reward_std": 0.28382984828203917,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.16315454244613647,
      "rewards/penalized_accuracy_reward/std": 0.2500864267349243,
      "rewards/reasoning_steps_reward/mean": 0.9375000149011612,
      "rewards/reasoning_steps_reward/std": 0.15744590014219284,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 200
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 801.5,
      "completions/max_terminated_length": 801.5,
      "completions/mean_length": 615.65625,
      "completions/mean_terminated_length": 615.65625,
      "completions/min_length": 399.75,
      "completions/min_terminated_length": 399.75,
      "epoch": 0.067,
      "grad_norm": 0.6389729380607605,
      "kl": 0.121337890625,
      "learning_rate": 1.3333333333333333e-05,
      "loss": -0.0109,
      "num_tokens": 9899770.0,
      "reward": 1.1019178926944733,
      "reward_std": 0.2688593650236726,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.11233451962471008,
      "rewards/penalized_accuracy_reward/std": 0.24164631962776184,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.05442607030272484,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 201
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 857.5,
      "completions/max_terminated_length": 857.5,
      "completions/mean_length": 635.640625,
      "completions/mean_terminated_length": 635.640625,
      "completions/min_length": 443.5,
      "completions/min_terminated_length": 443.5,
      "epoch": 0.06733333333333333,
      "grad_norm": 0.5509011745452881,
      "kl": 0.124755859375,
      "learning_rate": 1.3400000000000002e-05,
      "loss": 0.0041,
      "num_tokens": 9951955.0,
      "reward": 0.9807291626930237,
      "reward_std": 0.054120369255542755,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.08699213340878487,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 202
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 738.75,
      "completions/max_terminated_length": 738.75,
      "completions/mean_length": 535.328125,
      "completions/mean_terminated_length": 535.328125,
      "completions/min_length": 364.5,
      "completions/min_terminated_length": 364.5,
      "epoch": 0.06766666666666667,
      "grad_norm": 0.527125358581543,
      "kl": 0.154052734375,
      "learning_rate": 1.3466666666666668e-05,
      "loss": 0.012,
      "num_tokens": 10002312.0,
      "reward": 1.4380362629890442,
      "reward_std": 0.471890464425087,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.4484528675675392,
      "rewards/penalized_accuracy_reward/std": 0.45712582767009735,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.0833333320915699,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 203
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 901.5,
      "completions/max_terminated_length": 901.5,
      "completions/mean_length": 578.421875,
      "completions/mean_terminated_length": 578.421875,
      "completions/min_length": 352.25,
      "completions/min_terminated_length": 352.25,
      "epoch": 0.068,
      "grad_norm": 0.48045891523361206,
      "kl": 0.127685546875,
      "learning_rate": 1.3533333333333333e-05,
      "loss": -0.0418,
      "num_tokens": 10048499.0,
      "reward": 1.0155680775642395,
      "reward_std": 0.1726234508678317,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.03379720076918602,
      "rewards/penalized_accuracy_reward/std": 0.13518880307674408,
      "rewards/reasoning_steps_reward/mean": 0.9635416865348816,
      "rewards/reasoning_steps_reward/std": 0.11061252281069756,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 204
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 975.75,
      "completions/max_terminated_length": 966.5,
      "completions/mean_length": 748.6875,
      "completions/mean_terminated_length": 729.5687713623047,
      "completions/min_length": 463.25,
      "completions/min_terminated_length": 463.25,
      "epoch": 0.06833333333333333,
      "grad_norm": 0.6406579613685608,
      "kl": 0.1331787109375,
      "learning_rate": 1.3600000000000002e-05,
      "loss": 0.0482,
      "num_tokens": 10108911.0,
      "reward": 0.9862234890460968,
      "reward_std": 0.21676897443830967,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.1875,
      "rewards/penalized_accuracy_reward/mean": 0.03348911926150322,
      "rewards/penalized_accuracy_reward/std": 0.13395647704601288,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.96484375,
      "rewards/tag_count_reward/std": 0.078125,
      "step": 205
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 913.25,
      "completions/max_terminated_length": 869.75,
      "completions/mean_length": 655.359375,
      "completions/mean_terminated_length": 646.7114715576172,
      "completions/min_length": 412.0,
      "completions/min_terminated_length": 412.0,
      "epoch": 0.06866666666666667,
      "grad_norm": 0.699122965335846,
      "kl": 0.1441650390625,
      "learning_rate": 1.3666666666666667e-05,
      "loss": -0.0187,
      "num_tokens": 10160166.0,
      "reward": 1.2349582314491272,
      "reward_std": 0.5138601027429104,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.25,
      "rewards/penalized_accuracy_reward/mean": 0.2742811441421509,
      "rewards/penalized_accuracy_reward/std": 0.4517088681459427,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.06454972177743912,
      "rewards/tag_count_reward/mean": 0.9609375,
      "rewards/tag_count_reward/std": 0.15625,
      "step": 206
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 989.5,
      "completions/max_terminated_length": 881.75,
      "completions/mean_length": 710.875,
      "completions/mean_terminated_length": 677.321891784668,
      "completions/min_length": 426.25,
      "completions/min_terminated_length": 426.25,
      "epoch": 0.069,
      "grad_norm": 0.678071141242981,
      "kl": 0.1116943359375,
      "learning_rate": 1.3733333333333335e-05,
      "loss": 0.0659,
      "num_tokens": 10216158.0,
      "reward": 0.9995495826005936,
      "reward_std": 0.30108168721199036,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.29398179799318314,
      "rewards/penalized_accuracy_reward/mean": 0.07025270164012909,
      "rewards/penalized_accuracy_reward/std": 0.19196806848049164,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.91796875,
      "rewards/tag_count_reward/std": 0.15412572026252747,
      "step": 207
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 969.75,
      "completions/mean_length": 803.390625,
      "completions/mean_terminated_length": 779.1112823486328,
      "completions/min_length": 575.5,
      "completions/min_terminated_length": 575.5,
      "epoch": 0.06933333333333333,
      "grad_norm": 0.6706548929214478,
      "kl": 0.1131591796875,
      "learning_rate": 1.38e-05,
      "loss": 0.058,
      "num_tokens": 10279543.0,
      "reward": 1.1049699932336807,
      "reward_std": 0.3561432473361492,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.31116948276758194,
      "rewards/penalized_accuracy_reward/mean": 0.15562105178833008,
      "rewards/penalized_accuracy_reward/std": 0.23856909573078156,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.95703125,
      "rewards/tag_count_reward/std": 0.129237312823534,
      "step": 208
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 955.0,
      "completions/max_terminated_length": 922.0,
      "completions/mean_length": 654.59375,
      "completions/mean_terminated_length": 649.652099609375,
      "completions/min_length": 436.25,
      "completions/min_terminated_length": 436.25,
      "epoch": 0.06966666666666667,
      "grad_norm": 0.5483206510543823,
      "kl": 0.128173828125,
      "learning_rate": 1.3866666666666669e-05,
      "loss": 0.0133,
      "num_tokens": 10330861.0,
      "reward": 1.0247620195150375,
      "reward_std": 0.17300635110586882,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.03400679677724838,
      "rewards/penalized_accuracy_reward/std": 0.13602718710899353,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 209
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 922.75,
      "completions/max_terminated_length": 900.25,
      "completions/mean_length": 653.25,
      "completions/mean_terminated_length": 648.4000091552734,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 0.07,
      "grad_norm": 0.5754551291465759,
      "kl": 0.11962890625,
      "learning_rate": 1.3933333333333334e-05,
      "loss": 0.0279,
      "num_tokens": 10381597.0,
      "reward": 1.32256717979908,
      "reward_std": 0.33077580854296684,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.3414474129676819,
      "rewards/penalized_accuracy_reward/std": 0.311147004365921,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 210
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 784.75,
      "completions/max_terminated_length": 784.75,
      "completions/mean_length": 588.21875,
      "completions/mean_terminated_length": 588.21875,
      "completions/min_length": 399.5,
      "completions/min_terminated_length": 399.5,
      "epoch": 0.07033333333333333,
      "grad_norm": 0.3139912486076355,
      "kl": 0.1322021484375,
      "learning_rate": 1.4e-05,
      "loss": 0.0098,
      "num_tokens": 10429323.0,
      "reward": 0.9921875,
      "reward_std": 0.022662732750177383,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.984375,
      "rewards/reasoning_steps_reward/std": 0.045325469225645065,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 211
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 847.5,
      "completions/max_terminated_length": 847.5,
      "completions/mean_length": 609.25,
      "completions/mean_terminated_length": 609.25,
      "completions/min_length": 376.5,
      "completions/min_terminated_length": 376.5,
      "epoch": 0.07066666666666667,
      "grad_norm": 0.7940312027931213,
      "kl": 0.1170654296875,
      "learning_rate": 1.4066666666666669e-05,
      "loss": -0.0041,
      "num_tokens": 10481179.0,
      "reward": 1.0844275057315826,
      "reward_std": 0.3520566299557686,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.10109420865774155,
      "rewards/penalized_accuracy_reward/std": 0.319944828748703,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.049619100987911224,
      "step": 212
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 751.25,
      "completions/max_terminated_length": 751.25,
      "completions/mean_length": 569.296875,
      "completions/mean_terminated_length": 569.296875,
      "completions/min_length": 402.5,
      "completions/min_terminated_length": 402.5,
      "epoch": 0.071,
      "grad_norm": 0.3268507421016693,
      "kl": 0.1513671875,
      "learning_rate": 1.4133333333333334e-05,
      "loss": 0.006,
      "num_tokens": 10529582.0,
      "reward": 0.9973958432674408,
      "reward_std": 0.010416663251817226,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 213
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 686.0,
      "completions/max_terminated_length": 686.0,
      "completions/mean_length": 497.484375,
      "completions/mean_terminated_length": 497.484375,
      "completions/min_length": 295.25,
      "completions/min_terminated_length": 295.25,
      "epoch": 0.07133333333333333,
      "grad_norm": 0.8682353496551514,
      "kl": 0.1650390625,
      "learning_rate": 1.4200000000000001e-05,
      "loss": 0.033,
      "num_tokens": 10576925.0,
      "reward": 1.4779804199934006,
      "reward_std": 0.5776420421898365,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.4968606233596802,
      "rewards/penalized_accuracy_reward/std": 0.5501919239759445,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.028463751077651978,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 214
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 707.25,
      "completions/max_terminated_length": 707.25,
      "completions/mean_length": 487.40625,
      "completions/mean_terminated_length": 487.40625,
      "completions/min_length": 299.0,
      "completions/min_terminated_length": 299.0,
      "epoch": 0.07166666666666667,
      "grad_norm": 0.6823318600654602,
      "kl": 0.1435546875,
      "learning_rate": 1.4266666666666668e-05,
      "loss": -0.0286,
      "num_tokens": 10615719.0,
      "reward": 1.2218518257141113,
      "reward_std": 0.3399867806583643,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.24672159552574158,
      "rewards/penalized_accuracy_reward/std": 0.2893291413784027,
      "rewards/reasoning_steps_reward/mean": 0.9635416716337204,
      "rewards/reasoning_steps_reward/std": 0.12865880131721497,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 215
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 803.75,
      "completions/max_terminated_length": 803.75,
      "completions/mean_length": 544.71875,
      "completions/mean_terminated_length": 544.71875,
      "completions/min_length": 350.75,
      "completions/min_terminated_length": 350.75,
      "epoch": 0.072,
      "grad_norm": 0.5547329783439636,
      "kl": 0.165771484375,
      "learning_rate": 1.4333333333333334e-05,
      "loss": -0.0017,
      "num_tokens": 10659861.0,
      "reward": 1.2048795819282532,
      "reward_std": 0.3519400358200073,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.20487959496676922,
      "rewards/penalized_accuracy_reward/std": 0.35194002091884613,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 216
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 763.0,
      "completions/max_terminated_length": 763.0,
      "completions/mean_length": 544.296875,
      "completions/mean_terminated_length": 544.296875,
      "completions/min_length": 341.75,
      "completions/min_terminated_length": 341.75,
      "epoch": 0.07233333333333333,
      "grad_norm": 0.8118423819541931,
      "kl": 0.131591796875,
      "learning_rate": 1.4400000000000001e-05,
      "loss": -0.0361,
      "num_tokens": 10703416.0,
      "reward": 1.0395066440105438,
      "reward_std": 0.3176156934350729,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.06698058173060417,
      "rewards/penalized_accuracy_reward/std": 0.2679223269224167,
      "rewards/reasoning_steps_reward/mean": 0.958333358168602,
      "rewards/reasoning_steps_reward/std": 0.09419529885053635,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 217
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 801.0,
      "completions/max_terminated_length": 801.0,
      "completions/mean_length": 573.734375,
      "completions/mean_terminated_length": 573.734375,
      "completions/min_length": 383.25,
      "completions/min_terminated_length": 383.25,
      "epoch": 0.07266666666666667,
      "grad_norm": 0.3515082001686096,
      "kl": 0.1280517578125,
      "learning_rate": 1.4466666666666668e-05,
      "loss": 0.0054,
      "num_tokens": 10748999.0,
      "reward": 0.9921875,
      "reward_std": 0.03125,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.984375,
      "rewards/reasoning_steps_reward/std": 0.0625,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 218
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 810.0,
      "completions/max_terminated_length": 810.0,
      "completions/mean_length": 628.140625,
      "completions/mean_terminated_length": 628.140625,
      "completions/min_length": 400.5,
      "completions/min_terminated_length": 400.5,
      "epoch": 0.073,
      "grad_norm": 0.5115145444869995,
      "kl": 0.146728515625,
      "learning_rate": 1.4533333333333335e-05,
      "loss": -0.0195,
      "num_tokens": 10798048.0,
      "reward": 0.989583358168602,
      "reward_std": 0.03506519831717014,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9791666865348816,
      "rewards/reasoning_steps_reward/std": 0.07013041526079178,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 219
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 885.25,
      "completions/max_terminated_length": 874.0,
      "completions/mean_length": 538.0625,
      "completions/mean_terminated_length": 529.6958389282227,
      "completions/min_length": 195.5,
      "completions/min_terminated_length": 195.5,
      "epoch": 0.07333333333333333,
      "grad_norm": 0.7975037097930908,
      "kl": 0.14453125,
      "learning_rate": 1.46e-05,
      "loss": -0.0692,
      "num_tokens": 10846804.0,
      "reward": 1.0960131287574768,
      "reward_std": 0.4025039039552212,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.12348708137869835,
      "rewards/penalized_accuracy_reward/std": 0.3375791162252426,
      "rewards/reasoning_steps_reward/mean": 0.9583333432674408,
      "rewards/reasoning_steps_reward/std": 0.14949213340878487,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 220
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 844.75,
      "completions/max_terminated_length": 825.5,
      "completions/mean_length": 509.5,
      "completions/mean_terminated_length": 503.57188415527344,
      "completions/min_length": 176.0,
      "completions/min_terminated_length": 176.0,
      "epoch": 0.07366666666666667,
      "grad_norm": 0.48976442217826843,
      "kl": 0.140380859375,
      "learning_rate": 1.4666666666666666e-05,
      "loss": 0.0123,
      "num_tokens": 10888356.0,
      "reward": 1.0810218900442123,
      "reward_std": 0.21577799692749977,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.09026668965816498,
      "rewards/penalized_accuracy_reward/std": 0.1944141685962677,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 221
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 806.0,
      "completions/max_terminated_length": 806.0,
      "completions/mean_length": 577.078125,
      "completions/mean_terminated_length": 577.078125,
      "completions/min_length": 369.5,
      "completions/min_terminated_length": 369.5,
      "epoch": 0.074,
      "grad_norm": 0.40557998418807983,
      "kl": 0.16455078125,
      "learning_rate": 1.4733333333333335e-05,
      "loss": 0.0062,
      "num_tokens": 10935321.0,
      "reward": 1.033362090587616,
      "reward_std": 0.13344836235046387,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.03336208686232567,
      "rewards/penalized_accuracy_reward/std": 0.13344834744930267,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 222
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 897.75,
      "completions/max_terminated_length": 883.75,
      "completions/mean_length": 655.078125,
      "completions/mean_terminated_length": 647.2142944335938,
      "completions/min_length": 329.0,
      "completions/min_terminated_length": 329.0,
      "epoch": 0.07433333333333333,
      "grad_norm": 0.6898764967918396,
      "kl": 0.155517578125,
      "learning_rate": 1.48e-05,
      "loss": 0.0313,
      "num_tokens": 10985406.0,
      "reward": 1.0111503452062607,
      "reward_std": 0.15717457351274788,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.08539126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.028598245233297348,
      "rewards/penalized_accuracy_reward/std": 0.11439298838376999,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.9765625,
      "rewards/tag_count_reward/std": 0.07966229319572449,
      "step": 223
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 893.25,
      "completions/max_terminated_length": 889.75,
      "completions/mean_length": 664.59375,
      "completions/mean_terminated_length": 660.8270874023438,
      "completions/min_length": 344.5,
      "completions/min_terminated_length": 344.5,
      "epoch": 0.07466666666666667,
      "grad_norm": 0.41226571798324585,
      "kl": 0.162841796875,
      "learning_rate": 1.4866666666666668e-05,
      "loss": -0.0158,
      "num_tokens": 11037940.0,
      "reward": 0.9873698055744171,
      "reward_std": 0.04121637064963579,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 224
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 920.25,
      "completions/max_terminated_length": 895.5,
      "completions/mean_length": 717.703125,
      "completions/mean_terminated_length": 703.4521179199219,
      "completions/min_length": 513.75,
      "completions/min_terminated_length": 513.75,
      "epoch": 0.075,
      "grad_norm": 0.6623892188072205,
      "kl": 0.146728515625,
      "learning_rate": 1.4933333333333335e-05,
      "loss": 0.0225,
      "num_tokens": 11091969.0,
      "reward": 0.9588541686534882,
      "reward_std": 0.12408353574573994,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.21039126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.058679524809122086,
      "rewards/tag_count_reward/mean": 0.96875,
      "rewards/tag_count_reward/std": 0.11091229319572449,
      "step": 225
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 874.25,
      "completions/max_terminated_length": 874.25,
      "completions/mean_length": 585.78125,
      "completions/mean_terminated_length": 585.78125,
      "completions/min_length": 329.25,
      "completions/min_terminated_length": 329.25,
      "epoch": 0.07533333333333334,
      "grad_norm": 0.5859063863754272,
      "kl": 0.143798828125,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.0193,
      "num_tokens": 11138291.0,
      "reward": 1.2298401892185211,
      "reward_std": 0.44086357951164246,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.2428610548377037,
      "rewards/penalized_accuracy_reward/std": 0.4119364768266678,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.1041666641831398,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 226
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 897.75,
      "completions/max_terminated_length": 897.75,
      "completions/mean_length": 602.875,
      "completions/mean_terminated_length": 602.875,
      "completions/min_length": 370.0,
      "completions/min_terminated_length": 370.0,
      "epoch": 0.07566666666666666,
      "grad_norm": 0.48775744438171387,
      "kl": 0.15869140625,
      "learning_rate": 1.5066666666666668e-05,
      "loss": 0.0067,
      "num_tokens": 11185931.0,
      "reward": 1.0926258563995361,
      "reward_std": 0.2955952286720276,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.09262584149837494,
      "rewards/penalized_accuracy_reward/std": 0.2955952137708664,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 227
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 892.75,
      "completions/max_terminated_length": 887.25,
      "completions/mean_length": 649.046875,
      "completions/mean_terminated_length": 635.8930358886719,
      "completions/min_length": 361.5,
      "completions/min_terminated_length": 361.5,
      "epoch": 0.076,
      "grad_norm": 0.5001335740089417,
      "kl": 0.173095703125,
      "learning_rate": 1.5133333333333335e-05,
      "loss": 0.0474,
      "num_tokens": 11241678.0,
      "reward": 0.9748698025941849,
      "reward_std": 0.06366407126188278,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.10077822208404541,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.025194555521011353,
      "step": 228
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 891.25,
      "completions/max_terminated_length": 854.25,
      "completions/mean_length": 624.65625,
      "completions/mean_terminated_length": 614.6187744140625,
      "completions/min_length": 326.0,
      "completions/min_terminated_length": 326.0,
      "epoch": 0.07633333333333334,
      "grad_norm": 0.47809383273124695,
      "kl": 0.1624755859375,
      "learning_rate": 1.5200000000000002e-05,
      "loss": -0.0097,
      "num_tokens": 11291640.0,
      "reward": 1.1923899203538895,
      "reward_std": 0.3330077975988388,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.21686908602714539,
      "rewards/penalized_accuracy_reward/std": 0.28924980759620667,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.0833333320915699,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.0625,
      "step": 229
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 843.5,
      "completions/max_terminated_length": 843.5,
      "completions/mean_length": 557.65625,
      "completions/mean_terminated_length": 557.65625,
      "completions/min_length": 289.25,
      "completions/min_terminated_length": 289.25,
      "epoch": 0.07666666666666666,
      "grad_norm": 0.5777904987335205,
      "kl": 0.12939453125,
      "learning_rate": 1.5266666666666667e-05,
      "loss": -0.0835,
      "num_tokens": 11338354.0,
      "reward": 0.9869791865348816,
      "reward_std": 0.043496059253811836,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.08699213340878487,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 230
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 847.75,
      "completions/max_terminated_length": 777.75,
      "completions/mean_length": 503.15625,
      "completions/mean_terminated_length": 495.65834045410156,
      "completions/min_length": 303.0,
      "completions/min_terminated_length": 303.0,
      "epoch": 0.077,
      "grad_norm": 0.49381595849990845,
      "kl": 0.149169921875,
      "learning_rate": 1.5333333333333334e-05,
      "loss": 0.0168,
      "num_tokens": 11380140.0,
      "reward": 0.9829427152872086,
      "reward_std": 0.050058203749358654,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.0833333320915699,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 231
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 814.75,
      "completions/max_terminated_length": 814.75,
      "completions/mean_length": 585.375,
      "completions/mean_terminated_length": 585.375,
      "completions/min_length": 399.75,
      "completions/min_terminated_length": 399.75,
      "epoch": 0.07733333333333334,
      "grad_norm": 0.5595817565917969,
      "kl": 0.18212890625,
      "learning_rate": 1.54e-05,
      "loss": 0.0464,
      "num_tokens": 11428132.0,
      "reward": 1.1764193773269653,
      "reward_std": 0.33044473826885223,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.17641937546432018,
      "rewards/penalized_accuracy_reward/std": 0.330444760620594,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 232
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 817.5,
      "completions/max_terminated_length": 817.5,
      "completions/mean_length": 592.75,
      "completions/mean_terminated_length": 592.75,
      "completions/min_length": 343.0,
      "completions/min_terminated_length": 343.0,
      "epoch": 0.07766666666666666,
      "grad_norm": 0.547507107257843,
      "kl": 0.152099609375,
      "learning_rate": 1.546666666666667e-05,
      "loss": 0.0036,
      "num_tokens": 11476884.0,
      "reward": 1.026512697339058,
      "reward_std": 0.1592193841934204,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.03432518616318703,
      "rewards/penalized_accuracy_reward/std": 0.1373007446527481,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.0624999962747097,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 233
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 990.25,
      "completions/max_terminated_length": 990.25,
      "completions/mean_length": 614.59375,
      "completions/mean_terminated_length": 614.59375,
      "completions/min_length": 358.5,
      "completions/min_terminated_length": 358.5,
      "epoch": 0.078,
      "grad_norm": 0.4871361553668976,
      "kl": 0.1551513671875,
      "learning_rate": 1.5533333333333333e-05,
      "loss": -0.0094,
      "num_tokens": 11528666.0,
      "reward": 0.9895833432674408,
      "reward_std": 0.03506520017981529,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9791666865348816,
      "rewards/reasoning_steps_reward/std": 0.07013041526079178,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 234
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 848.75,
      "completions/max_terminated_length": 848.75,
      "completions/mean_length": 588.375,
      "completions/mean_terminated_length": 588.375,
      "completions/min_length": 317.5,
      "completions/min_terminated_length": 317.5,
      "epoch": 0.07833333333333334,
      "grad_norm": 0.7485732436180115,
      "kl": 0.16357421875,
      "learning_rate": 1.5600000000000003e-05,
      "loss": -0.0261,
      "num_tokens": 11580130.0,
      "reward": 1.0502691864967346,
      "reward_std": 0.28249461110681295,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.06589415855705738,
      "rewards/penalized_accuracy_reward/std": 0.2635766342282295,
      "rewards/reasoning_steps_reward/mean": 0.9687500149011612,
      "rewards/reasoning_steps_reward/std": 0.1249999962747097,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 235
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 983.0,
      "completions/max_terminated_length": 891.0,
      "completions/mean_length": 640.84375,
      "completions/mean_terminated_length": 627.8812713623047,
      "completions/min_length": 349.75,
      "completions/min_terminated_length": 349.75,
      "epoch": 0.07866666666666666,
      "grad_norm": 0.7337653636932373,
      "kl": 0.158203125,
      "learning_rate": 1.5666666666666667e-05,
      "loss": 0.0663,
      "num_tokens": 11631096.0,
      "reward": 0.9947942942380905,
      "reward_std": 0.16349013429135084,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.021877631545066833,
      "rewards/penalized_accuracy_reward/std": 0.08751052618026733,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.1041666641831398,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.0625,
      "step": 236
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 918.25,
      "completions/max_terminated_length": 862.5,
      "completions/mean_length": 616.828125,
      "completions/mean_terminated_length": 609.6697998046875,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 0.079,
      "grad_norm": 0.6300519108772278,
      "kl": 0.158203125,
      "learning_rate": 1.5733333333333334e-05,
      "loss": 0.0524,
      "num_tokens": 11680093.0,
      "reward": 0.9907552152872086,
      "reward_std": 0.036979163996875286,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 237
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 827.0,
      "completions/max_terminated_length": 827.0,
      "completions/mean_length": 623.21875,
      "completions/mean_terminated_length": 623.21875,
      "completions/min_length": 373.75,
      "completions/min_terminated_length": 373.75,
      "epoch": 0.07933333333333334,
      "grad_norm": 0.5821474194526672,
      "kl": 0.167724609375,
      "learning_rate": 1.58e-05,
      "loss": -0.0171,
      "num_tokens": 11730379.0,
      "reward": 1.0398301482200623,
      "reward_std": 0.18846993148326874,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.055455148220062256,
      "rewards/penalized_accuracy_reward/std": 0.1535360962152481,
      "rewards/reasoning_steps_reward/mean": 0.9687500298023224,
      "rewards/reasoning_steps_reward/std": 0.11179707944393158,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 238
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 794.0,
      "completions/max_terminated_length": 794.0,
      "completions/mean_length": 552.4375,
      "completions/mean_terminated_length": 552.4375,
      "completions/min_length": 286.0,
      "completions/min_terminated_length": 286.0,
      "epoch": 0.07966666666666666,
      "grad_norm": 0.37174347043037415,
      "kl": 0.140869140625,
      "learning_rate": 1.586666666666667e-05,
      "loss": -0.055,
      "num_tokens": 11777623.0,
      "reward": 0.9921875149011612,
      "reward_std": 0.03124999161809683,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.0624999962747097,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 239
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 915.75,
      "completions/max_terminated_length": 915.75,
      "completions/mean_length": 667.171875,
      "completions/mean_terminated_length": 667.171875,
      "completions/min_length": 443.25,
      "completions/min_terminated_length": 443.25,
      "epoch": 0.08,
      "grad_norm": 0.4442844092845917,
      "kl": 0.166748046875,
      "learning_rate": 1.5933333333333336e-05,
      "loss": 0.0116,
      "num_tokens": 11829234.0,
      "reward": 0.9973958432674408,
      "reward_std": 0.010416663251817226,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 240
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 851.0,
      "completions/max_terminated_length": 851.0,
      "completions/mean_length": 587.765625,
      "completions/mean_terminated_length": 587.765625,
      "completions/min_length": 365.0,
      "completions/min_terminated_length": 365.0,
      "epoch": 0.08033333333333334,
      "grad_norm": 0.5305766463279724,
      "kl": 0.156005859375,
      "learning_rate": 1.6000000000000003e-05,
      "loss": -0.0256,
      "num_tokens": 11874867.0,
      "reward": 0.9921875149011612,
      "reward_std": 0.03124999161809683,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.0624999962747097,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 241
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 948.0,
      "completions/max_terminated_length": 900.75,
      "completions/mean_length": 645.859375,
      "completions/mean_terminated_length": 624.7781372070312,
      "completions/min_length": 387.75,
      "completions/min_terminated_length": 387.75,
      "epoch": 0.08066666666666666,
      "grad_norm": 0.5956756472587585,
      "kl": 0.16650390625,
      "learning_rate": 1.606666666666667e-05,
      "loss": 0.0793,
      "num_tokens": 11923994.0,
      "reward": 1.2291074991226196,
      "reward_std": 0.497161440551281,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.17430340498685837,
      "rewards/penalized_accuracy_reward/mean": 0.26309189945459366,
      "rewards/penalized_accuracy_reward/std": 0.476230688393116,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.97265625,
      "rewards/tag_count_reward/std": 0.07482585124671459,
      "step": 242
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 975.75,
      "completions/max_terminated_length": 889.0,
      "completions/mean_length": 740.203125,
      "completions/mean_terminated_length": 730.7469024658203,
      "completions/min_length": 511.0,
      "completions/min_terminated_length": 511.0,
      "epoch": 0.081,
      "grad_norm": 0.6802368760108948,
      "kl": 0.1600341796875,
      "learning_rate": 1.6133333333333334e-05,
      "loss": 0.0122,
      "num_tokens": 11983095.0,
      "reward": 1.0122395604848862,
      "reward_std": 0.15847744420170784,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.03593749925494194,
      "rewards/penalized_accuracy_reward/std": 0.09819994866847992,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.048112526535987854,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 243
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 860.25,
      "completions/max_terminated_length": 860.0,
      "completions/mean_length": 608.390625,
      "completions/mean_terminated_length": 600.5848236083984,
      "completions/min_length": 350.0,
      "completions/min_terminated_length": 350.0,
      "epoch": 0.08133333333333333,
      "grad_norm": 0.6472396850585938,
      "kl": 0.15234375,
      "learning_rate": 1.62e-05,
      "loss": 0.0666,
      "num_tokens": 12032208.0,
      "reward": 1.2856135964393616,
      "reward_std": 0.19508842751383781,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.08539126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.30410313606262207,
      "rewards/penalized_accuracy_reward/std": 0.16097909212112427,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.021347815170884132,
      "step": 244
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 974.75,
      "completions/max_terminated_length": 957.5,
      "completions/mean_length": 700.984375,
      "completions/mean_terminated_length": 691.9833526611328,
      "completions/min_length": 390.0,
      "completions/min_terminated_length": 390.0,
      "epoch": 0.08166666666666667,
      "grad_norm": 0.6965994834899902,
      "kl": 0.1689453125,
      "learning_rate": 1.6266666666666668e-05,
      "loss": 0.0147,
      "num_tokens": 12086783.0,
      "reward": 0.9736979007720947,
      "reward_std": 0.08700593560934067,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.1041666641831398,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 245
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 912.5,
      "completions/max_terminated_length": 907.0,
      "completions/mean_length": 660.859375,
      "completions/mean_terminated_length": 656.5062561035156,
      "completions/min_length": 365.75,
      "completions/min_terminated_length": 365.75,
      "epoch": 0.082,
      "grad_norm": 0.7551724314689636,
      "kl": 0.1632080078125,
      "learning_rate": 1.6333333333333335e-05,
      "loss": 0.0243,
      "num_tokens": 12141766.0,
      "reward": 1.347618505358696,
      "reward_std": 0.7300410941243172,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.36806120723485947,
      "rewards/penalized_accuracy_reward/std": 0.6883421540260315,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.1041666641831398,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 246
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 881.0,
      "completions/max_terminated_length": 881.0,
      "completions/mean_length": 622.515625,
      "completions/mean_terminated_length": 622.515625,
      "completions/min_length": 336.75,
      "completions/min_terminated_length": 336.75,
      "epoch": 0.08233333333333333,
      "grad_norm": 0.4745469093322754,
      "kl": 0.16064453125,
      "learning_rate": 1.64e-05,
      "loss": 0.0349,
      "num_tokens": 12192311.0,
      "reward": 1.404757171869278,
      "reward_std": 0.5057430565357208,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.4073612689971924,
      "rewards/penalized_accuracy_reward/std": 0.5086499452590942,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 247
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 822.75,
      "completions/max_terminated_length": 822.75,
      "completions/mean_length": 583.453125,
      "completions/mean_terminated_length": 583.453125,
      "completions/min_length": 403.5,
      "completions/min_terminated_length": 403.5,
      "epoch": 0.08266666666666667,
      "grad_norm": 0.7214362621307373,
      "kl": 0.17041015625,
      "learning_rate": 1.646666666666667e-05,
      "loss": -0.0147,
      "num_tokens": 12238516.0,
      "reward": 1.0527930855751038,
      "reward_std": 0.23437042441219091,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.07024098187685013,
      "rewards/penalized_accuracy_reward/std": 0.19193504750728607,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.0833333320915699,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.021347815170884132,
      "step": 248
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 854.75,
      "completions/max_terminated_length": 828.75,
      "completions/mean_length": 616.859375,
      "completions/mean_terminated_length": 612.0948028564453,
      "completions/min_length": 371.5,
      "completions/min_terminated_length": 371.5,
      "epoch": 0.083,
      "grad_norm": 0.6579822897911072,
      "kl": 0.1611328125,
      "learning_rate": 1.6533333333333333e-05,
      "loss": 0.0076,
      "num_tokens": 12285963.0,
      "reward": 1.0059796571731567,
      "reward_std": 0.15834287833422422,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.08539126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.027073414996266365,
      "rewards/penalized_accuracy_reward/std": 0.10829365998506546,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.0624999962747097,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.021347815170884132,
      "step": 249
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 777.75,
      "completions/max_terminated_length": 750.0,
      "completions/mean_length": 547.78125,
      "completions/mean_terminated_length": 542.2343902587891,
      "completions/min_length": 345.0,
      "completions/min_terminated_length": 345.0,
      "epoch": 0.08333333333333333,
      "grad_norm": 0.9504538774490356,
      "kl": 0.186279296875,
      "learning_rate": 1.66e-05,
      "loss": 0.0474,
      "num_tokens": 12331069.0,
      "reward": 0.9817708432674408,
      "reward_std": 0.06432939507067204,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.06615880131721497,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.0625,
      "step": 250
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 752.5,
      "completions/max_terminated_length": 752.5,
      "completions/mean_length": 485.703125,
      "completions/mean_terminated_length": 485.703125,
      "completions/min_length": 248.75,
      "completions/min_terminated_length": 248.75,
      "epoch": 0.08366666666666667,
      "grad_norm": 0.9305344223976135,
      "kl": 0.1630859375,
      "learning_rate": 1.6666666666666667e-05,
      "loss": -0.0476,
      "num_tokens": 12374634.0,
      "reward": 1.0772633850574493,
      "reward_std": 0.3262657858431339,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.11111752688884735,
      "rewards/penalized_accuracy_reward/std": 0.23903486132621765,
      "rewards/reasoning_steps_reward/mean": 0.9322916865348816,
      "rewards/reasoning_steps_reward/std": 0.17446180433034897,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 251
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 704.75,
      "completions/max_terminated_length": 704.75,
      "completions/mean_length": 485.328125,
      "completions/mean_terminated_length": 485.328125,
      "completions/min_length": 288.25,
      "completions/min_terminated_length": 288.25,
      "epoch": 0.084,
      "grad_norm": 0.8013951778411865,
      "kl": 0.16162109375,
      "learning_rate": 1.6733333333333335e-05,
      "loss": -0.0377,
      "num_tokens": 12416351.0,
      "reward": 1.0147216022014618,
      "reward_std": 0.204719758592546,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.03295077010989189,
      "rewards/penalized_accuracy_reward/std": 0.13180309534072876,
      "rewards/reasoning_steps_reward/mean": 0.9635416716337204,
      "rewards/reasoning_steps_reward/std": 0.1458333320915699,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 252
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 783.5,
      "completions/max_terminated_length": 783.5,
      "completions/mean_length": 513.40625,
      "completions/mean_terminated_length": 513.40625,
      "completions/min_length": 274.5,
      "completions/min_terminated_length": 274.5,
      "epoch": 0.08433333333333333,
      "grad_norm": 0.12178342789411545,
      "kl": 0.175048828125,
      "learning_rate": 1.6800000000000002e-05,
      "loss": 0.007,
      "num_tokens": 12462121.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 253
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 851.75,
      "completions/max_terminated_length": 851.75,
      "completions/mean_length": 581.484375,
      "completions/mean_terminated_length": 581.484375,
      "completions/min_length": 353.5,
      "completions/min_terminated_length": 353.5,
      "epoch": 0.08466666666666667,
      "grad_norm": 0.6153361797332764,
      "kl": 0.2001953125,
      "learning_rate": 1.686666666666667e-05,
      "loss": 0.0297,
      "num_tokens": 12508840.0,
      "reward": 1.1047951132059097,
      "reward_std": 0.2661496289074421,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.11143571883440018,
      "rewards/penalized_accuracy_reward/std": 0.23958711326122284,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 254
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 811.25,
      "completions/max_terminated_length": 811.25,
      "completions/mean_length": 496.40625,
      "completions/mean_terminated_length": 496.40625,
      "completions/min_length": 261.75,
      "completions/min_terminated_length": 261.75,
      "epoch": 0.085,
      "grad_norm": 0.6173115372657776,
      "kl": 0.173095703125,
      "learning_rate": 1.6933333333333336e-05,
      "loss": -0.0366,
      "num_tokens": 12548482.0,
      "reward": 1.1505606770515442,
      "reward_std": 0.35186809953302145,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.1531648486852646,
      "rewards/penalized_accuracy_reward/std": 0.3414514288306236,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 255
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 615.75,
      "completions/max_terminated_length": 615.75,
      "completions/mean_length": 392.28125,
      "completions/mean_terminated_length": 392.28125,
      "completions/min_length": 216.25,
      "completions/min_terminated_length": 216.25,
      "epoch": 0.08533333333333333,
      "grad_norm": 0.7294853329658508,
      "kl": 0.1728515625,
      "learning_rate": 1.7e-05,
      "loss": -0.0149,
      "num_tokens": 12582468.0,
      "reward": 1.2870292961597443,
      "reward_std": 0.31009191926568747,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.2896333932876587,
      "rewards/penalized_accuracy_reward/std": 0.29967525601387024,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 256
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 762.5,
      "completions/max_terminated_length": 762.5,
      "completions/mean_length": 523.546875,
      "completions/mean_terminated_length": 523.546875,
      "completions/min_length": 266.0,
      "completions/min_terminated_length": 266.0,
      "epoch": 0.08566666666666667,
      "grad_norm": 0.32328730821609497,
      "kl": 0.1884765625,
      "learning_rate": 1.706666666666667e-05,
      "loss": 0.0309,
      "num_tokens": 12629335.0,
      "reward": 1.0314277112483978,
      "reward_std": 0.1257108747959137,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.031427718698978424,
      "rewards/penalized_accuracy_reward/std": 0.1257108747959137,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 257
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 774.25,
      "completions/max_terminated_length": 746.5,
      "completions/mean_length": 553.3125,
      "completions/mean_terminated_length": 548.5385437011719,
      "completions/min_length": 315.0,
      "completions/min_terminated_length": 315.0,
      "epoch": 0.086,
      "grad_norm": 0.8301519155502319,
      "kl": 0.17578125,
      "learning_rate": 1.7133333333333334e-05,
      "loss": 0.0291,
      "num_tokens": 12675451.0,
      "reward": 1.024748146533966,
      "reward_std": 0.15378290473017842,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.03438352793455124,
      "rewards/penalized_accuracy_reward/std": 0.13753412663936615,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 258
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 757.5,
      "completions/max_terminated_length": 722.75,
      "completions/mean_length": 498.90625,
      "completions/mean_terminated_length": 493.2687530517578,
      "completions/min_length": 303.75,
      "completions/min_terminated_length": 303.75,
      "epoch": 0.08633333333333333,
      "grad_norm": 0.6008172631263733,
      "kl": 0.219970703125,
      "learning_rate": 1.72e-05,
      "loss": 0.01,
      "num_tokens": 12717413.0,
      "reward": 1.2428324222564697,
      "reward_std": 0.4229099154472351,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.2520771995186806,
      "rewards/penalized_accuracy_reward/std": 0.41564619541168213,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 259
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 803.25,
      "completions/max_terminated_length": 803.25,
      "completions/mean_length": 598.71875,
      "completions/mean_terminated_length": 598.71875,
      "completions/min_length": 417.75,
      "completions/min_terminated_length": 417.75,
      "epoch": 0.08666666666666667,
      "grad_norm": 0.42760568857192993,
      "kl": 0.21142578125,
      "learning_rate": 1.726666666666667e-05,
      "loss": 0.0179,
      "num_tokens": 12765491.0,
      "reward": 1.0692082047462463,
      "reward_std": 0.2115136981010437,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.07584881037473679,
      "rewards/penalized_accuracy_reward/std": 0.20726299285888672,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 260
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 796.0,
      "completions/max_terminated_length": 796.0,
      "completions/mean_length": 563.328125,
      "completions/mean_terminated_length": 563.328125,
      "completions/min_length": 325.5,
      "completions/min_terminated_length": 325.5,
      "epoch": 0.087,
      "grad_norm": 0.5432370901107788,
      "kl": 0.1748046875,
      "learning_rate": 1.7333333333333336e-05,
      "loss": 0.0149,
      "num_tokens": 12810776.0,
      "reward": 0.9929687529802322,
      "reward_std": 0.02812499925494194,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 261
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 882.75,
      "completions/max_terminated_length": 880.25,
      "completions/mean_length": 624.921875,
      "completions/mean_terminated_length": 619.2250061035156,
      "completions/min_length": 424.75,
      "completions/min_terminated_length": 424.75,
      "epoch": 0.08733333333333333,
      "grad_norm": 0.61566561460495,
      "kl": 0.185791015625,
      "learning_rate": 1.7400000000000003e-05,
      "loss": 0.0556,
      "num_tokens": 12859571.0,
      "reward": 1.3334292024374008,
      "reward_std": 0.586882371455431,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.34006984531879425,
      "rewards/penalized_accuracy_reward/std": 0.5603198409080505,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 262
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 780.5,
      "completions/max_terminated_length": 780.5,
      "completions/mean_length": 567.390625,
      "completions/mean_terminated_length": 567.390625,
      "completions/min_length": 351.0,
      "completions/min_terminated_length": 351.0,
      "epoch": 0.08766666666666667,
      "grad_norm": 0.4211113154888153,
      "kl": 0.180419921875,
      "learning_rate": 1.7466666666666667e-05,
      "loss": 0.0048,
      "num_tokens": 12904428.0,
      "reward": 0.9907552152872086,
      "reward_std": 0.036979163996875286,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 263
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 724.25,
      "completions/max_terminated_length": 714.5,
      "completions/mean_length": 522.640625,
      "completions/mean_terminated_length": 516.8020935058594,
      "completions/min_length": 365.25,
      "completions/min_terminated_length": 365.25,
      "epoch": 0.088,
      "grad_norm": 0.3019135296344757,
      "kl": 0.176513671875,
      "learning_rate": 1.7533333333333337e-05,
      "loss": 0.0236,
      "num_tokens": 12948645.0,
      "reward": 1.027706801891327,
      "reward_std": 0.1416618824005127,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.03434742987155914,
      "rewards/penalized_accuracy_reward/std": 0.13738973438739777,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 264
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 913.75,
      "completions/max_terminated_length": 875.0,
      "completions/mean_length": 612.90625,
      "completions/mean_terminated_length": 607.7541809082031,
      "completions/min_length": 429.0,
      "completions/min_terminated_length": 429.0,
      "epoch": 0.08833333333333333,
      "grad_norm": 0.47055670619010925,
      "kl": 0.1806640625,
      "learning_rate": 1.76e-05,
      "loss": 0.0249,
      "num_tokens": 12999887.0,
      "reward": 1.103381261229515,
      "reward_std": 0.2237912304699421,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.11002188175916672,
      "rewards/penalized_accuracy_reward/std": 0.19722871482372284,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 265
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 803.5,
      "completions/max_terminated_length": 803.5,
      "completions/mean_length": 576.75,
      "completions/mean_terminated_length": 576.75,
      "completions/min_length": 412.5,
      "completions/min_terminated_length": 412.5,
      "epoch": 0.08866666666666667,
      "grad_norm": 0.40757638216018677,
      "kl": 0.2119140625,
      "learning_rate": 1.7666666666666668e-05,
      "loss": -0.0054,
      "num_tokens": 13047551.0,
      "reward": 0.9933593720197678,
      "reward_std": 0.02656250074505806,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 266
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 890.25,
      "completions/max_terminated_length": 780.5,
      "completions/mean_length": 595.78125,
      "completions/mean_terminated_length": 581.792724609375,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 0.089,
      "grad_norm": 0.6589139103889465,
      "kl": 0.158447265625,
      "learning_rate": 1.7733333333333335e-05,
      "loss": 0.0656,
      "num_tokens": 13095889.0,
      "reward": 1.192888155579567,
      "reward_std": 0.20832654368132353,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.2225756049156189,
      "rewards/penalized_accuracy_reward/std": 0.1634555160999298,
      "rewards/reasoning_steps_reward/mean": 0.9687500149011612,
      "rewards/reasoning_steps_reward/std": 0.0936010368168354,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.0625,
      "step": 267
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 779.0,
      "completions/max_terminated_length": 779.0,
      "completions/mean_length": 513.046875,
      "completions/mean_terminated_length": 513.046875,
      "completions/min_length": 287.75,
      "completions/min_terminated_length": 287.75,
      "epoch": 0.08933333333333333,
      "grad_norm": 0.5512727499008179,
      "kl": 0.167236328125,
      "learning_rate": 1.7800000000000002e-05,
      "loss": -0.0193,
      "num_tokens": 13145492.0,
      "reward": 1.1362604349851608,
      "reward_std": 0.3192702382802963,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.14928126335144043,
      "rewards/penalized_accuracy_reward/std": 0.2671869397163391,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.1041666641831398,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 268
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 777.25,
      "completions/max_terminated_length": 777.25,
      "completions/mean_length": 584.609375,
      "completions/mean_terminated_length": 584.609375,
      "completions/min_length": 351.75,
      "completions/min_terminated_length": 351.75,
      "epoch": 0.08966666666666667,
      "grad_norm": 0.5141566395759583,
      "kl": 0.194580078125,
      "learning_rate": 1.7866666666666666e-05,
      "loss": -0.0297,
      "num_tokens": 13196363.0,
      "reward": 0.9873698055744171,
      "reward_std": 0.03819324728101492,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.033994100987911224,
      "step": 269
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 831.0,
      "completions/max_terminated_length": 750.0,
      "completions/mean_length": 570.375,
      "completions/mean_terminated_length": 561.8895874023438,
      "completions/min_length": 363.25,
      "completions/min_terminated_length": 363.25,
      "epoch": 0.09,
      "grad_norm": 0.4032162129878998,
      "kl": 0.18017578125,
      "learning_rate": 1.7933333333333333e-05,
      "loss": -0.0049,
      "num_tokens": 13244691.0,
      "reward": 1.3360155820846558,
      "reward_std": 0.20632131397724152,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.3464322090148926,
      "rewards/penalized_accuracy_reward/std": 0.20845253765583038,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.0625,
      "step": 270
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 883.0,
      "completions/max_terminated_length": 798.75,
      "completions/mean_length": 498.546875,
      "completions/mean_terminated_length": 490.7093811035156,
      "completions/min_length": 308.5,
      "completions/min_terminated_length": 308.5,
      "epoch": 0.09033333333333333,
      "grad_norm": 0.7402020692825317,
      "kl": 0.1796875,
      "learning_rate": 1.8e-05,
      "loss": 0.0827,
      "num_tokens": 13286470.0,
      "reward": 1.0911231935024261,
      "reward_std": 0.22846381599083543,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.10049816966056824,
      "rewards/penalized_accuracy_reward/std": 0.21616530418395996,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.96875,
      "rewards/tag_count_reward/std": 0.125,
      "step": 271
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 751.75,
      "completions/max_terminated_length": 751.75,
      "completions/mean_length": 562.796875,
      "completions/mean_terminated_length": 562.796875,
      "completions/min_length": 408.75,
      "completions/min_terminated_length": 408.75,
      "epoch": 0.09066666666666667,
      "grad_norm": 0.3422425091266632,
      "kl": 0.206298828125,
      "learning_rate": 1.8066666666666668e-05,
      "loss": 0.0166,
      "num_tokens": 13336505.0,
      "reward": 1.0235688090324402,
      "reward_std": 0.1415824592113495,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0339854471385479,
      "rewards/penalized_accuracy_reward/std": 0.13594180345535278,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.05692750960588455,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 272
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 767.5,
      "completions/max_terminated_length": 767.5,
      "completions/mean_length": 544.65625,
      "completions/mean_terminated_length": 544.65625,
      "completions/min_length": 337.75,
      "completions/min_terminated_length": 337.75,
      "epoch": 0.091,
      "grad_norm": 0.23651820421218872,
      "kl": 0.1640625,
      "learning_rate": 1.8133333333333335e-05,
      "loss": -0.0169,
      "num_tokens": 13380803.0,
      "reward": 0.9903645813465118,
      "reward_std": 0.02933359704911709,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 273
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 786.75,
      "completions/max_terminated_length": 786.75,
      "completions/mean_length": 498.234375,
      "completions/mean_terminated_length": 498.234375,
      "completions/min_length": 277.0,
      "completions/min_terminated_length": 277.0,
      "epoch": 0.09133333333333334,
      "grad_norm": 0.6377527117729187,
      "kl": 0.165283203125,
      "learning_rate": 1.8200000000000002e-05,
      "loss": -0.0266,
      "num_tokens": 13425106.0,
      "reward": 1.054260477423668,
      "reward_std": 0.27288868278265,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.07652609050273895,
      "rewards/penalized_accuracy_reward/std": 0.2091090977191925,
      "rewards/reasoning_steps_reward/mean": 0.984375,
      "rewards/reasoning_steps_reward/std": 0.045325469225645065,
      "rewards/tag_count_reward/mean": 0.98046875,
      "rewards/tag_count_reward/std": 0.078125,
      "step": 274
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 776.0,
      "completions/max_terminated_length": 776.0,
      "completions/mean_length": 485.234375,
      "completions/mean_terminated_length": 485.234375,
      "completions/min_length": 280.0,
      "completions/min_terminated_length": 280.0,
      "epoch": 0.09166666666666666,
      "grad_norm": 0.6798899173736572,
      "kl": 0.169189453125,
      "learning_rate": 1.826666666666667e-05,
      "loss": -0.0909,
      "num_tokens": 13464209.0,
      "reward": 0.9557291865348816,
      "reward_std": 0.11883590277284384,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.911458358168602,
      "rewards/reasoning_steps_reward/std": 0.23767182603478432,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 275
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 796.75,
      "completions/max_terminated_length": 796.75,
      "completions/mean_length": 568.765625,
      "completions/mean_terminated_length": 568.765625,
      "completions/min_length": 344.0,
      "completions/min_terminated_length": 344.0,
      "epoch": 0.092,
      "grad_norm": 0.5649985074996948,
      "kl": 0.16259765625,
      "learning_rate": 1.8333333333333333e-05,
      "loss": -0.0531,
      "num_tokens": 13511362.0,
      "reward": 0.9759114682674408,
      "reward_std": 0.09635416325181723,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.0833333320915699,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 276
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 896.25,
      "completions/max_terminated_length": 881.0,
      "completions/mean_length": 666.0625,
      "completions/mean_terminated_length": 661.7218780517578,
      "completions/min_length": 439.0,
      "completions/min_terminated_length": 439.0,
      "epoch": 0.09233333333333334,
      "grad_norm": 0.5934914946556091,
      "kl": 0.18359375,
      "learning_rate": 1.8400000000000003e-05,
      "loss": 0.0345,
      "num_tokens": 13564806.0,
      "reward": 0.9390624910593033,
      "reward_std": 0.11813730373978615,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.27289126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.953125,
      "rewards/tag_count_reward/std": 0.09526265040040016,
      "step": 277
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 801.75,
      "completions/max_terminated_length": 801.75,
      "completions/mean_length": 603.125,
      "completions/mean_terminated_length": 603.125,
      "completions/min_length": 336.0,
      "completions/min_terminated_length": 336.0,
      "epoch": 0.09266666666666666,
      "grad_norm": 0.8149129152297974,
      "kl": 0.19921875,
      "learning_rate": 1.8466666666666667e-05,
      "loss": 0.0618,
      "num_tokens": 13612222.0,
      "reward": 0.9855468720197678,
      "reward_std": 0.057812500395812094,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.98046875,
      "rewards/tag_count_reward/std": 0.078125,
      "step": 278
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 900.5,
      "completions/max_terminated_length": 885.5,
      "completions/mean_length": 724.78125,
      "completions/mean_terminated_length": 717.1986694335938,
      "completions/min_length": 517.25,
      "completions/min_terminated_length": 517.25,
      "epoch": 0.093,
      "grad_norm": 0.3915879726409912,
      "kl": 0.1728515625,
      "learning_rate": 1.8533333333333334e-05,
      "loss": 0.018,
      "num_tokens": 13667648.0,
      "reward": 0.9841145873069763,
      "reward_std": 0.0467079458758235,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.08539126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.021347815170884132,
      "step": 279
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 907.75,
      "completions/max_terminated_length": 887.75,
      "completions/mean_length": 635.84375,
      "completions/mean_terminated_length": 630.9354248046875,
      "completions/min_length": 325.5,
      "completions/min_terminated_length": 325.5,
      "epoch": 0.09333333333333334,
      "grad_norm": 0.4796634614467621,
      "kl": 0.1650390625,
      "learning_rate": 1.86e-05,
      "loss": 0.0293,
      "num_tokens": 13720150.0,
      "reward": 0.9566406309604645,
      "reward_std": 0.09092025086283684,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.1971946656703949,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.94140625,
      "rewards/tag_count_reward/std": 0.1265372931957245,
      "step": 280
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 839.75,
      "completions/max_terminated_length": 807.0,
      "completions/mean_length": 557.234375,
      "completions/mean_terminated_length": 547.1428680419922,
      "completions/min_length": 304.5,
      "completions/min_terminated_length": 304.5,
      "epoch": 0.09366666666666666,
      "grad_norm": 0.6499218940734863,
      "kl": 0.19873046875,
      "learning_rate": 1.866666666666667e-05,
      "loss": -0.005,
      "num_tokens": 13764053.0,
      "reward": 1.1594230234622955,
      "reward_std": 0.308895755559206,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.10077822208404541,
      "rewards/penalized_accuracy_reward/mean": 0.2009594738483429,
      "rewards/penalized_accuracy_reward/std": 0.20988135039806366,
      "rewards/reasoning_steps_reward/mean": 0.9583333432674408,
      "rewards/reasoning_steps_reward/std": 0.10885214060544968,
      "rewards/tag_count_reward/mean": 0.98046875,
      "rewards/tag_count_reward/std": 0.049575019627809525,
      "step": 281
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 823.75,
      "completions/max_terminated_length": 815.0,
      "completions/mean_length": 603.921875,
      "completions/mean_terminated_length": 598.4656372070312,
      "completions/min_length": 353.5,
      "completions/min_terminated_length": 353.5,
      "epoch": 0.094,
      "grad_norm": 0.4768775403499603,
      "kl": 0.1806640625,
      "learning_rate": 1.8733333333333336e-05,
      "loss": 0.0299,
      "num_tokens": 13813200.0,
      "reward": 1.022420957684517,
      "reward_std": 0.18214058130979538,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.037915751338005066,
      "rewards/penalized_accuracy_reward/std": 0.15166300535202026,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 282
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 810.25,
      "completions/max_terminated_length": 810.25,
      "completions/mean_length": 616.890625,
      "completions/mean_terminated_length": 616.890625,
      "completions/min_length": 406.25,
      "completions/min_terminated_length": 406.25,
      "epoch": 0.09433333333333334,
      "grad_norm": 0.4238940179347992,
      "kl": 0.180419921875,
      "learning_rate": 1.88e-05,
      "loss": -0.0062,
      "num_tokens": 13862537.0,
      "reward": 0.990364596247673,
      "reward_std": 0.03854166250675917,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 283
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 802.75,
      "completions/max_terminated_length": 802.75,
      "completions/mean_length": 610.5,
      "completions/mean_terminated_length": 610.5,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 0.09466666666666666,
      "grad_norm": 0.3470335900783539,
      "kl": 0.1865234375,
      "learning_rate": 1.886666666666667e-05,
      "loss": 0.0145,
      "num_tokens": 13910329.0,
      "reward": 0.9973958432674408,
      "reward_std": 0.010416663251817226,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 284
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 790.5,
      "completions/max_terminated_length": 790.5,
      "completions/mean_length": 569.078125,
      "completions/mean_terminated_length": 569.078125,
      "completions/min_length": 363.25,
      "completions/min_terminated_length": 363.25,
      "epoch": 0.095,
      "grad_norm": 0.4298518896102905,
      "kl": 0.1962890625,
      "learning_rate": 1.8933333333333334e-05,
      "loss": 0.0273,
      "num_tokens": 13955998.0,
      "reward": 0.9911458343267441,
      "reward_std": 0.026434535160660744,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 285
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 746.5,
      "completions/max_terminated_length": 746.5,
      "completions/mean_length": 500.328125,
      "completions/mean_terminated_length": 500.328125,
      "completions/min_length": 272.25,
      "completions/min_terminated_length": 272.25,
      "epoch": 0.09533333333333334,
      "grad_norm": 0.36656492948532104,
      "kl": 0.195556640625,
      "learning_rate": 1.9e-05,
      "loss": 0.0052,
      "num_tokens": 13999267.0,
      "reward": 1.035222053527832,
      "reward_std": 0.1523541957139969,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.03782619908452034,
      "rewards/penalized_accuracy_reward/std": 0.15130481123924255,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 286
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 717.0,
      "completions/max_terminated_length": 717.0,
      "completions/mean_length": 480.203125,
      "completions/mean_terminated_length": 480.203125,
      "completions/min_length": 238.75,
      "completions/min_terminated_length": 238.75,
      "epoch": 0.09566666666666666,
      "grad_norm": 0.7211626172065735,
      "kl": 0.20703125,
      "learning_rate": 1.9066666666666668e-05,
      "loss": -0.0091,
      "num_tokens": 14044032.0,
      "reward": 0.9789062440395355,
      "reward_std": 0.08437500149011612,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.984375,
      "rewards/reasoning_steps_reward/std": 0.0625,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 287
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 568.0,
      "completions/max_terminated_length": 568.0,
      "completions/mean_length": 398.984375,
      "completions/mean_terminated_length": 398.984375,
      "completions/min_length": 206.5,
      "completions/min_terminated_length": 206.5,
      "epoch": 0.096,
      "grad_norm": 0.7898841500282288,
      "kl": 0.2197265625,
      "learning_rate": 1.9133333333333335e-05,
      "loss": -0.068,
      "num_tokens": 14079407.0,
      "reward": 0.9791666865348816,
      "reward_std": 0.06615879014134407,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9583333432674408,
      "rewards/reasoning_steps_reward/std": 0.13231760263442993,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 288
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 661.25,
      "completions/max_terminated_length": 661.25,
      "completions/mean_length": 460.09375,
      "completions/mean_terminated_length": 460.09375,
      "completions/min_length": 265.5,
      "completions/min_terminated_length": 265.5,
      "epoch": 0.09633333333333334,
      "grad_norm": 0.4932043254375458,
      "kl": 0.185791015625,
      "learning_rate": 1.9200000000000003e-05,
      "loss": -0.0723,
      "num_tokens": 14120565.0,
      "reward": 1.018554836511612,
      "reward_std": 0.13671931624412537,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.02636732906103134,
      "rewards/penalized_accuracy_reward/std": 0.10546931624412537,
      "rewards/reasoning_steps_reward/mean": 0.984375,
      "rewards/reasoning_steps_reward/std": 0.0625,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 289
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 639.75,
      "completions/max_terminated_length": 639.75,
      "completions/mean_length": 482.90625,
      "completions/mean_terminated_length": 482.90625,
      "completions/min_length": 266.25,
      "completions/min_terminated_length": 266.25,
      "epoch": 0.09666666666666666,
      "grad_norm": 0.1181245893239975,
      "kl": 0.24072265625,
      "learning_rate": 1.926666666666667e-05,
      "loss": 0.0096,
      "num_tokens": 14161055.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 290
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 711.25,
      "completions/max_terminated_length": 711.25,
      "completions/mean_length": 523.328125,
      "completions/mean_terminated_length": 523.328125,
      "completions/min_length": 346.0,
      "completions/min_terminated_length": 346.0,
      "epoch": 0.097,
      "grad_norm": 0.3490356504917145,
      "kl": 0.21484375,
      "learning_rate": 1.9333333333333333e-05,
      "loss": 0.0178,
      "num_tokens": 14204964.0,
      "reward": 1.052797555923462,
      "reward_std": 0.144706130027771,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.05279757082462311,
      "rewards/penalized_accuracy_reward/std": 0.1447061449289322,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 291
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 787.75,
      "completions/max_terminated_length": 787.75,
      "completions/mean_length": 582.15625,
      "completions/mean_terminated_length": 582.15625,
      "completions/min_length": 410.0,
      "completions/min_terminated_length": 410.0,
      "epoch": 0.09733333333333333,
      "grad_norm": 0.4878210723400116,
      "kl": 0.20556640625,
      "learning_rate": 1.94e-05,
      "loss": 0.0187,
      "num_tokens": 14253038.0,
      "reward": 1.0754664540290833,
      "reward_std": 0.28016388416290283,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.08210709318518639,
      "rewards/penalized_accuracy_reward/std": 0.27484729140996933,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 292
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 706.75,
      "completions/max_terminated_length": 706.75,
      "completions/mean_length": 535.59375,
      "completions/mean_terminated_length": 535.59375,
      "completions/min_length": 408.75,
      "completions/min_terminated_length": 408.75,
      "epoch": 0.09766666666666667,
      "grad_norm": 0.5533336400985718,
      "kl": 0.20703125,
      "learning_rate": 1.9466666666666668e-05,
      "loss": 0.0085,
      "num_tokens": 14296564.0,
      "reward": 1.1724434942007065,
      "reward_std": 0.27161475270986557,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.18168824911117554,
      "rewards/penalized_accuracy_reward/std": 0.24373677372932434,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 293
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 692.75,
      "completions/max_terminated_length": 692.75,
      "completions/mean_length": 539.421875,
      "completions/mean_terminated_length": 539.421875,
      "completions/min_length": 375.5,
      "completions/min_terminated_length": 375.5,
      "epoch": 0.098,
      "grad_norm": 0.14699687063694,
      "kl": 0.2314453125,
      "learning_rate": 1.9533333333333335e-05,
      "loss": 0.0093,
      "num_tokens": 14340399.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 294
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 831.5,
      "completions/max_terminated_length": 831.5,
      "completions/mean_length": 598.734375,
      "completions/mean_terminated_length": 598.734375,
      "completions/min_length": 421.0,
      "completions/min_terminated_length": 421.0,
      "epoch": 0.09833333333333333,
      "grad_norm": 0.5051561594009399,
      "kl": 0.232177734375,
      "learning_rate": 1.9600000000000002e-05,
      "loss": 0.0006,
      "num_tokens": 14387806.0,
      "reward": 0.9841145873069763,
      "reward_std": 0.0467079458758235,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.08539126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.9921875,
      "rewards/tag_count_reward/std": 0.021347815170884132,
      "step": 295
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 937.75,
      "completions/max_terminated_length": 937.75,
      "completions/mean_length": 716.828125,
      "completions/mean_terminated_length": 716.828125,
      "completions/min_length": 512.75,
      "completions/min_terminated_length": 512.75,
      "epoch": 0.09866666666666667,
      "grad_norm": 0.33059147000312805,
      "kl": 0.159423828125,
      "learning_rate": 1.9666666666666666e-05,
      "loss": 0.002,
      "num_tokens": 14444563.0,
      "reward": 0.9973958432674408,
      "reward_std": 0.010416663251817226,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 296
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 911.75,
      "completions/max_terminated_length": 848.5,
      "completions/mean_length": 708.296875,
      "completions/mean_terminated_length": 702.8958435058594,
      "completions/min_length": 485.5,
      "completions/min_terminated_length": 485.5,
      "epoch": 0.099,
      "grad_norm": 0.6231974363327026,
      "kl": 0.19970703125,
      "learning_rate": 1.9733333333333336e-05,
      "loss": 0.0085,
      "num_tokens": 14499014.0,
      "reward": 0.927083358168602,
      "reward_std": 0.15352921932935715,
      "rewards/format_reward/mean": 0.859375,
      "rewards/format_reward/std": 0.2979728877544403,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9791666865348816,
      "rewards/reasoning_steps_reward/std": 0.0833333283662796,
      "rewards/tag_count_reward/mean": 0.9375,
      "rewards/tag_count_reward/std": 0.13525452837347984,
      "step": 297
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 899.0,
      "completions/max_terminated_length": 858.5,
      "completions/mean_length": 623.4375,
      "completions/mean_terminated_length": 616.4937591552734,
      "completions/min_length": 419.0,
      "completions/min_terminated_length": 419.0,
      "epoch": 0.09933333333333333,
      "grad_norm": 1.3560590744018555,
      "kl": 0.213134765625,
      "learning_rate": 1.98e-05,
      "loss": -0.0724,
      "num_tokens": 14548466.0,
      "reward": 0.6550781428813934,
      "reward_std": 0.18048088252544403,
      "rewards/format_reward/mean": 0.234375,
      "rewards/format_reward/std": 0.4006601870059967,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.61328125,
      "rewards/tag_count_reward/std": 0.2045094631612301,
      "step": 298
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 873.5,
      "completions/max_terminated_length": 873.5,
      "completions/mean_length": 669.78125,
      "completions/mean_terminated_length": 669.78125,
      "completions/min_length": 468.75,
      "completions/min_terminated_length": 468.75,
      "epoch": 0.09966666666666667,
      "grad_norm": 0.15527978539466858,
      "kl": 0.196044921875,
      "learning_rate": 1.9866666666666667e-05,
      "loss": 0.0078,
      "num_tokens": 14603700.0,
      "reward": 1.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 299
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 991.0,
      "completions/mean_length": 869.875,
      "completions/mean_terminated_length": 833.1428833007812,
      "completions/min_length": 634.25,
      "completions/min_terminated_length": 634.25,
      "epoch": 0.1,
      "grad_norm": 0.5940057635307312,
      "kl": 0.183349609375,
      "learning_rate": 1.9933333333333334e-05,
      "loss": 0.0858,
      "num_tokens": 14673020.0,
      "reward": 0.919921875,
      "reward_std": 0.16870027035474777,
      "rewards/format_reward/mean": 0.8125,
      "rewards/format_reward/std": 0.3943893313407898,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.94921875,
      "rewards/tag_count_reward/std": 0.11124361865222454,
      "step": 300
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 906.0,
      "completions/max_terminated_length": 906.0,
      "completions/mean_length": 718.484375,
      "completions/mean_terminated_length": 718.484375,
      "completions/min_length": 521.25,
      "completions/min_terminated_length": 521.25,
      "epoch": 0.10033333333333333,
      "grad_norm": 0.4100304841995239,
      "kl": 0.172119140625,
      "learning_rate": 2e-05,
      "loss": 0.0099,
      "num_tokens": 14731275.0,
      "reward": 1.0655567944049835,
      "reward_std": 0.2622271999716759,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.06555680371820927,
      "rewards/penalized_accuracy_reward/std": 0.26222722977399826,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 301
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 975.25,
      "completions/max_terminated_length": 947.75,
      "completions/mean_length": 798.65625,
      "completions/mean_terminated_length": 758.5296630859375,
      "completions/min_length": 491.5,
      "completions/min_terminated_length": 491.5,
      "epoch": 0.10066666666666667,
      "grad_norm": 0.6387519836425781,
      "kl": 0.197509765625,
      "learning_rate": 1.999999323072477e-05,
      "loss": 0.0339,
      "num_tokens": 14793445.0,
      "reward": 0.7894531488418579,
      "reward_std": 0.2002791464328766,
      "rewards/format_reward/mean": 0.59375,
      "rewards/format_reward/std": 0.46449070423841476,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.51953125,
      "rewards/tag_count_reward/std": 0.2541043721139431,
      "step": 302
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 942.25,
      "completions/max_terminated_length": 878.5,
      "completions/mean_length": 685.21875,
      "completions/mean_terminated_length": 645.7833557128906,
      "completions/min_length": 409.25,
      "completions/min_terminated_length": 409.25,
      "epoch": 0.101,
      "grad_norm": 0.7488775849342346,
      "kl": 0.199951171875,
      "learning_rate": 1.999997292290824e-05,
      "loss": 0.166,
      "num_tokens": 14846803.0,
      "reward": 0.9568192660808563,
      "reward_std": 0.272556833922863,
      "rewards/format_reward/mean": 0.8125,
      "rewards/format_reward/std": 0.29237766563892365,
      "rewards/penalized_accuracy_reward/mean": 0.05642864480614662,
      "rewards/penalized_accuracy_reward/std": 0.15471996366977692,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.75390625,
      "rewards/tag_count_reward/std": 0.3052690625190735,
      "step": 303
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 793.25,
      "completions/max_terminated_length": 793.25,
      "completions/mean_length": 533.421875,
      "completions/mean_terminated_length": 533.421875,
      "completions/min_length": 355.0,
      "completions/min_terminated_length": 355.0,
      "epoch": 0.10133333333333333,
      "grad_norm": 0.760302722454071,
      "kl": 0.222900390625,
      "learning_rate": 1.9999939076577906e-05,
      "loss": 0.0037,
      "num_tokens": 14893182.0,
      "reward": 1.0884817838668823,
      "reward_std": 0.4543069154024124,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.10671091079711914,
      "rewards/penalized_accuracy_reward/std": 0.42684365808963776,
      "rewards/reasoning_steps_reward/mean": 0.9635416716337204,
      "rewards/reasoning_steps_reward/std": 0.11148427054286003,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 304
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 756.5,
      "completions/max_terminated_length": 756.5,
      "completions/mean_length": 585.265625,
      "completions/mean_terminated_length": 585.265625,
      "completions/min_length": 422.0,
      "completions/min_terminated_length": 422.0,
      "epoch": 0.10166666666666667,
      "grad_norm": 0.5774600505828857,
      "kl": 0.211669921875,
      "learning_rate": 1.999989169177959e-05,
      "loss": -0.0124,
      "num_tokens": 14937551.0,
      "reward": 0.9802083224058151,
      "reward_std": 0.06704528210684657,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9635416716337204,
      "rewards/reasoning_steps_reward/std": 0.12704972177743912,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.042695630341768265,
      "step": 305
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 773.0,
      "completions/max_terminated_length": 773.0,
      "completions/mean_length": 473.1875,
      "completions/mean_terminated_length": 473.1875,
      "completions/min_length": 286.75,
      "completions/min_terminated_length": 286.75,
      "epoch": 0.102,
      "grad_norm": 0.7446598410606384,
      "kl": 0.22705078125,
      "learning_rate": 1.9999830768577445e-05,
      "loss": 0.0203,
      "num_tokens": 14978139.0,
      "reward": 1.1352150440216064,
      "reward_std": 0.26945349760353565,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.19732439517974854,
      "rewards/penalized_accuracy_reward/std": 0.18291671574115753,
      "rewards/reasoning_steps_reward/mean": 0.90625,
      "rewards/reasoning_steps_reward/std": 0.18649740889668465,
      "rewards/tag_count_reward/mean": 0.97265625,
      "rewards/tag_count_reward/std": 0.07668973132967949,
      "step": 306
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 740.25,
      "completions/max_terminated_length": 740.25,
      "completions/mean_length": 507.578125,
      "completions/mean_terminated_length": 507.578125,
      "completions/min_length": 329.25,
      "completions/min_terminated_length": 329.25,
      "epoch": 0.10233333333333333,
      "grad_norm": 0.6429900527000427,
      "kl": 0.20166015625,
      "learning_rate": 1.9999756307053947e-05,
      "loss": -0.0358,
      "num_tokens": 15020304.0,
      "reward": 1.074374109506607,
      "reward_std": 0.2731225108727813,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.10419180989265442,
      "rewards/penalized_accuracy_reward/std": 0.22402693331241608,
      "rewards/reasoning_steps_reward/mean": 0.9427083432674408,
      "rewards/reasoning_steps_reward/std": 0.13398722559213638,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.03697281517088413,
      "step": 307
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 731.25,
      "completions/max_terminated_length": 692.75,
      "completions/mean_length": 516.828125,
      "completions/mean_terminated_length": 509.7354278564453,
      "completions/min_length": 339.0,
      "completions/min_terminated_length": 339.0,
      "epoch": 0.10266666666666667,
      "grad_norm": 0.7568418979644775,
      "kl": 0.234619140625,
      "learning_rate": 1.999966830730992e-05,
      "loss": 0.0749,
      "num_tokens": 15062277.0,
      "reward": 0.9467447847127914,
      "reward_std": 0.11273385118693113,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9114583432674408,
      "rewards/reasoning_steps_reward/std": 0.18772627413272858,
      "rewards/tag_count_reward/mean": 0.97265625,
      "rewards/tag_count_reward/std": 0.08538510836660862,
      "step": 308
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 881.25,
      "completions/max_terminated_length": 881.25,
      "completions/mean_length": 571.71875,
      "completions/mean_terminated_length": 571.71875,
      "completions/min_length": 352.25,
      "completions/min_terminated_length": 352.25,
      "epoch": 0.103,
      "grad_norm": 1.1642966270446777,
      "kl": 0.25732421875,
      "learning_rate": 1.9999566769464483e-05,
      "loss": 0.0204,
      "num_tokens": 15111011.0,
      "reward": 0.9882493168115616,
      "reward_std": 0.2335687279701233,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.03486389294266701,
      "rewards/penalized_accuracy_reward/std": 0.13945557177066803,
      "rewards/reasoning_steps_reward/mean": 0.942708358168602,
      "rewards/reasoning_steps_reward/std": 0.14845871925354004,
      "rewards/tag_count_reward/mean": 0.9453125,
      "rewards/tag_count_reward/std": 0.11003002151846886,
      "step": 309
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 705.75,
      "completions/max_terminated_length": 705.75,
      "completions/mean_length": 541.015625,
      "completions/mean_terminated_length": 541.015625,
      "completions/min_length": 359.75,
      "completions/min_terminated_length": 359.75,
      "epoch": 0.10333333333333333,
      "grad_norm": 0.6742736101150513,
      "kl": 0.2041015625,
      "learning_rate": 1.9999451693655125e-05,
      "loss": -0.0092,
      "num_tokens": 15156244.0,
      "reward": 0.9901554882526398,
      "reward_std": 0.2408963106572628,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.21039126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.035337790846824646,
      "rewards/penalized_accuracy_reward/std": 0.14135116338729858,
      "rewards/reasoning_steps_reward/mean": 0.9635416865348816,
      "rewards/reasoning_steps_reward/std": 0.08714327588677406,
      "rewards/tag_count_reward/mean": 0.98046875,
      "rewards/tag_count_reward/std": 0.05644455552101135,
      "step": 310
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 792.25,
      "completions/max_terminated_length": 792.25,
      "completions/mean_length": 565.4375,
      "completions/mean_terminated_length": 565.4375,
      "completions/min_length": 398.75,
      "completions/min_terminated_length": 398.75,
      "epoch": 0.10366666666666667,
      "grad_norm": 0.7241008281707764,
      "kl": 0.27734375,
      "learning_rate": 1.9999323080037623e-05,
      "loss": 0.0163,
      "num_tokens": 15203392.0,
      "reward": 1.0042553097009659,
      "reward_std": 0.1819033268839121,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.1875,
      "rewards/penalized_accuracy_reward/mean": 0.02938549779355526,
      "rewards/penalized_accuracy_reward/std": 0.11754199117422104,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.98828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 311
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 802.5,
      "completions/max_terminated_length": 802.5,
      "completions/mean_length": 581.71875,
      "completions/mean_terminated_length": 581.71875,
      "completions/min_length": 346.25,
      "completions/min_terminated_length": 346.25,
      "epoch": 0.104,
      "grad_norm": 0.47812944650650024,
      "kl": 0.269775390625,
      "learning_rate": 1.9999180928786113e-05,
      "loss": 0.006,
      "num_tokens": 15249166.0,
      "reward": 1.1379987299442291,
      "reward_std": 0.26195312198251486,
      "rewards/format_reward/mean": 1.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.14060290157794952,
      "rewards/penalized_accuracy_reward/std": 0.25153645873069763,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 312
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 862.25,
      "completions/max_terminated_length": 862.25,
      "completions/mean_length": 581.5,
      "completions/mean_terminated_length": 581.5,
      "completions/min_length": 395.5,
      "completions/min_terminated_length": 395.5,
      "epoch": 0.10433333333333333,
      "grad_norm": 0.5108832120895386,
      "kl": 0.306640625,
      "learning_rate": 1.9999025240093045e-05,
      "loss": 0.033,
      "num_tokens": 15295694.0,
      "reward": 1.0145233571529388,
      "reward_std": 0.11746851913630962,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.028585880994796753,
      "rewards/penalized_accuracy_reward/std": 0.11434352397918701,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.0625,
      "step": 313
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 888.25,
      "completions/max_terminated_length": 888.25,
      "completions/mean_length": 619.921875,
      "completions/mean_terminated_length": 619.921875,
      "completions/min_length": 384.25,
      "completions/min_terminated_length": 384.25,
      "epoch": 0.10466666666666667,
      "grad_norm": 0.4307948350906372,
      "kl": 0.276611328125,
      "learning_rate": 1.9998856014169193e-05,
      "loss": 0.0133,
      "num_tokens": 15344585.0,
      "reward": 1.0649471282958984,
      "reward_std": 0.26679350435733795,
      "rewards/format_reward/mean": 0.96875,
      "rewards/format_reward/std": 0.125,
      "rewards/penalized_accuracy_reward/mean": 0.07900964096188545,
      "rewards/penalized_accuracy_reward/std": 0.256163090467453,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.0625,
      "step": 314
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 895.5,
      "completions/max_terminated_length": 895.5,
      "completions/mean_length": 611.375,
      "completions/mean_terminated_length": 611.375,
      "completions/min_length": 342.75,
      "completions/min_terminated_length": 342.75,
      "epoch": 0.105,
      "grad_norm": 0.5434288382530212,
      "kl": 0.29736328125,
      "learning_rate": 1.9998673251243672e-05,
      "loss": 0.0095,
      "num_tokens": 15394177.0,
      "reward": 1.2838045805692673,
      "reward_std": 0.5009002275764942,
      "rewards/format_reward/mean": 0.953125,
      "rewards/format_reward/std": 0.14789126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.3078931048512459,
      "rewards/penalized_accuracy_reward/std": 0.4622473865747452,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.97265625,
      "rewards/tag_count_reward/std": 0.07966844737529755,
      "step": 315
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 950.75,
      "completions/max_terminated_length": 863.75,
      "completions/mean_length": 669.609375,
      "completions/mean_terminated_length": 648.5461578369141,
      "completions/min_length": 464.25,
      "completions/min_terminated_length": 464.25,
      "epoch": 0.10533333333333333,
      "grad_norm": 0.8112999796867371,
      "kl": 0.3037109375,
      "learning_rate": 1.9998476951563914e-05,
      "loss": 0.059,
      "num_tokens": 15450456.0,
      "reward": 0.8890625238418579,
      "reward_std": 0.19580427184700966,
      "rewards/format_reward/mean": 0.75,
      "rewards/format_reward/std": 0.44091323018074036,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.890625,
      "rewards/tag_count_reward/std": 0.21520674601197243,
      "step": 316
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 716.25,
      "completions/max_terminated_length": 716.25,
      "completions/mean_length": 511.171875,
      "completions/mean_terminated_length": 511.171875,
      "completions/min_length": 332.0,
      "completions/min_terminated_length": 332.0,
      "epoch": 0.10566666666666667,
      "grad_norm": 0.8981077671051025,
      "kl": 0.2529296875,
      "learning_rate": 1.999826711539568e-05,
      "loss": 0.0537,
      "num_tokens": 15492371.0,
      "reward": 0.93831005692482,
      "reward_std": 0.3836033381521702,
      "rewards/format_reward/mean": 0.640625,
      "rewards/format_reward/std": 0.4436737895011902,
      "rewards/penalized_accuracy_reward/mean": 0.09729444235563278,
      "rewards/penalized_accuracy_reward/std": 0.20956267416477203,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.84765625,
      "rewards/tag_count_reward/std": 0.24108554422855377,
      "step": 317
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 931.5,
      "completions/max_terminated_length": 889.0,
      "completions/mean_length": 634.109375,
      "completions/mean_terminated_length": 627.4500122070312,
      "completions/min_length": 356.25,
      "completions/min_terminated_length": 356.25,
      "epoch": 0.106,
      "grad_norm": 0.7437970638275146,
      "kl": 0.23193359375,
      "learning_rate": 1.9998043743023056e-05,
      "loss": 0.0702,
      "num_tokens": 15543850.0,
      "reward": 0.8113281279802322,
      "reward_std": 0.21419205144047737,
      "rewards/format_reward/mean": 0.5625,
      "rewards/format_reward/std": 0.4970766380429268,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.86328125,
      "rewards/tag_count_reward/std": 0.17918536625802517,
      "step": 318
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 684.5,
      "completions/max_terminated_length": 684.5,
      "completions/mean_length": 528.03125,
      "completions/mean_terminated_length": 528.03125,
      "completions/min_length": 383.5,
      "completions/min_terminated_length": 383.5,
      "epoch": 0.10633333333333334,
      "grad_norm": 0.8449734449386597,
      "kl": 0.2451171875,
      "learning_rate": 1.9997806834748455e-05,
      "loss": 0.0074,
      "num_tokens": 15589388.0,
      "reward": 0.9253906309604645,
      "reward_std": 0.15440326184034348,
      "rewards/format_reward/mean": 0.828125,
      "rewards/format_reward/std": 0.35648179799318314,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.94140625,
      "rewards/tag_count_reward/std": 0.1257193200290203,
      "step": 319
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 671.5,
      "completions/max_terminated_length": 671.5,
      "completions/mean_length": 475.796875,
      "completions/mean_terminated_length": 475.796875,
      "completions/min_length": 307.0,
      "completions/min_terminated_length": 307.0,
      "epoch": 0.10666666666666667,
      "grad_norm": 0.7161391377449036,
      "kl": 0.25830078125,
      "learning_rate": 1.9997556390892623e-05,
      "loss": -0.0051,
      "num_tokens": 15634095.0,
      "reward": 1.007447510957718,
      "reward_std": 0.16748694330453873,
      "rewards/format_reward/mean": 0.9375,
      "rewards/format_reward/std": 0.21039126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.034010034054517746,
      "rewards/penalized_accuracy_reward/std": 0.13604013621807098,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.984375,
      "rewards/tag_count_reward/std": 0.05259781517088413,
      "step": 320
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 623.75,
      "completions/max_terminated_length": 623.75,
      "completions/mean_length": 358.078125,
      "completions/mean_terminated_length": 358.078125,
      "completions/min_length": 211.25,
      "completions/min_terminated_length": 211.25,
      "epoch": 0.107,
      "grad_norm": 9.0658597946167,
      "kl": 0.81982421875,
      "learning_rate": 1.999729241179462e-05,
      "loss": 0.1053,
      "num_tokens": 15666116.0,
      "reward": 0.9933593720197678,
      "reward_std": 0.02656250074505806,
      "rewards/format_reward/mean": 0.984375,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.99609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 321
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 721.75,
      "completions/max_terminated_length": 721.75,
      "completions/mean_length": 501.75,
      "completions/mean_terminated_length": 501.75,
      "completions/min_length": 289.75,
      "completions/min_terminated_length": 289.75,
      "epoch": 0.10733333333333334,
      "grad_norm": 0.9017462730407715,
      "kl": 0.28466796875,
      "learning_rate": 1.9997014897811834e-05,
      "loss": 0.0382,
      "num_tokens": 15707316.0,
      "reward": 0.6950409710407257,
      "reward_std": 0.3689886610955,
      "rewards/format_reward/mean": 0.015625,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.12446802854537964,
      "rewards/penalized_accuracy_reward/std": 0.34050996601581573,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.6953125,
      "rewards/tag_count_reward/std": 0.14960849285125732,
      "step": 322
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 727.0,
      "completions/mean_length": 554.4375,
      "completions/mean_terminated_length": 486.5102767944336,
      "completions/min_length": 317.5,
      "completions/min_terminated_length": 317.5,
      "epoch": 0.10766666666666666,
      "grad_norm": 0.8251973390579224,
      "kl": 0.3310546875,
      "learning_rate": 1.9996723849319978e-05,
      "loss": 0.0938,
      "num_tokens": 15752624.0,
      "reward": 0.536979153752327,
      "reward_std": 0.04579480132088065,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.05810113251209259,
      "rewards/tag_count_reward/mean": 0.5,
      "rewards/tag_count_reward/std": 0.25458791851997375,
      "step": 323
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 950.75,
      "completions/max_terminated_length": 822.25,
      "completions/mean_length": 559.09375,
      "completions/mean_terminated_length": 491.10145568847656,
      "completions/min_length": 292.25,
      "completions/min_terminated_length": 292.25,
      "epoch": 0.108,
      "grad_norm": 0.9178186655044556,
      "kl": 0.298828125,
      "learning_rate": 1.9996419266713097e-05,
      "loss": 0.0633,
      "num_tokens": 15796566.0,
      "reward": 0.5134114697575569,
      "reward_std": 0.09221521113067865,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9166666865348816,
      "rewards/reasoning_steps_reward/std": 0.17712190747261047,
      "rewards/tag_count_reward/mean": 0.55078125,
      "rewards/tag_count_reward/std": 0.24923434294760227,
      "step": 324
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 960.25,
      "completions/max_terminated_length": 834.75,
      "completions/mean_length": 540.5625,
      "completions/mean_terminated_length": 478.70812225341797,
      "completions/min_length": 326.5,
      "completions/min_terminated_length": 326.5,
      "epoch": 0.10833333333333334,
      "grad_norm": 1.0782766342163086,
      "kl": 0.35205078125,
      "learning_rate": 1.9996101150403543e-05,
      "loss": 0.1742,
      "num_tokens": 15842634.0,
      "reward": 0.548046886920929,
      "reward_std": 0.042725150007754564,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.04929708316922188,
      "rewards/tag_count_reward/mean": 0.55859375,
      "rewards/tag_count_reward/std": 0.24080571345984936,
      "step": 325
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 934.0,
      "completions/max_terminated_length": 697.0,
      "completions/mean_length": 463.09375,
      "completions/mean_terminated_length": 427.4317092895508,
      "completions/min_length": 251.25,
      "completions/min_terminated_length": 251.25,
      "epoch": 0.10866666666666666,
      "grad_norm": 0.9316300749778748,
      "kl": 0.34326171875,
      "learning_rate": 1.9995769500822007e-05,
      "loss": -0.0657,
      "num_tokens": 15881488.0,
      "reward": 0.5242187529802322,
      "reward_std": 0.09430638235062361,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9218750298023224,
      "rewards/reasoning_steps_reward/std": 0.18572967126965523,
      "rewards/tag_count_reward/mean": 0.6328125,
      "rewards/tag_count_reward/std": 0.16978351771831512,
      "step": 326
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 870.75,
      "completions/max_terminated_length": 870.75,
      "completions/mean_length": 579.671875,
      "completions/mean_terminated_length": 579.671875,
      "completions/min_length": 337.5,
      "completions/min_terminated_length": 337.5,
      "epoch": 0.109,
      "grad_norm": 0.873878538608551,
      "kl": 0.31982421875,
      "learning_rate": 1.999542431841749e-05,
      "loss": -0.0013,
      "num_tokens": 15926859.0,
      "reward": 0.5614583343267441,
      "reward_std": 0.030561438761651516,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.640625,
      "rewards/tag_count_reward/std": 0.24114800989627838,
      "step": 327
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 891.75,
      "completions/max_terminated_length": 860.0,
      "completions/mean_length": 568.375,
      "completions/mean_terminated_length": 557.2075958251953,
      "completions/min_length": 337.5,
      "completions/min_terminated_length": 337.5,
      "epoch": 0.10933333333333334,
      "grad_norm": 0.7906535863876343,
      "kl": 0.301025390625,
      "learning_rate": 1.9995065603657317e-05,
      "loss": 0.0167,
      "num_tokens": 15974915.0,
      "reward": 0.6875823885202408,
      "reward_std": 0.2568901313934475,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.12716573476791382,
      "rewards/penalized_accuracy_reward/std": 0.22754468023777008,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.65625,
      "rewards/tag_count_reward/std": 0.21732262521982193,
      "step": 328
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 756.75,
      "completions/max_terminated_length": 756.75,
      "completions/mean_length": 528.4375,
      "completions/mean_terminated_length": 528.4375,
      "completions/min_length": 325.75,
      "completions/min_terminated_length": 325.75,
      "epoch": 0.10966666666666666,
      "grad_norm": 0.8963666558265686,
      "kl": 0.32958984375,
      "learning_rate": 1.999469335702714e-05,
      "loss": -0.0342,
      "num_tokens": 16015903.0,
      "reward": 0.5386718809604645,
      "reward_std": 0.06200896389782429,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9531250149011612,
      "rewards/reasoning_steps_reward/std": 0.0888747088611126,
      "rewards/tag_count_reward/mean": 0.62109375,
      "rewards/tag_count_reward/std": 0.2715306803584099,
      "step": 329
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 818.0,
      "completions/max_terminated_length": 818.0,
      "completions/mean_length": 545.5625,
      "completions/mean_terminated_length": 545.5625,
      "completions/min_length": 291.25,
      "completions/min_terminated_length": 291.25,
      "epoch": 0.11,
      "grad_norm": 0.860392153263092,
      "kl": 0.29736328125,
      "learning_rate": 1.9994307579030925e-05,
      "loss": -0.0076,
      "num_tokens": 16058915.0,
      "reward": 0.5680989325046539,
      "reward_std": 0.023084016982465982,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.70703125,
      "rewards/tag_count_reward/std": 0.14216844737529755,
      "step": 330
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 955.5,
      "completions/max_terminated_length": 930.5,
      "completions/mean_length": 696.234375,
      "completions/mean_terminated_length": 691.7843780517578,
      "completions/min_length": 459.5,
      "completions/min_terminated_length": 459.5,
      "epoch": 0.11033333333333334,
      "grad_norm": 0.34245073795318604,
      "kl": 0.267578125,
      "learning_rate": 1.999390827019096e-05,
      "loss": -0.0036,
      "num_tokens": 16112370.0,
      "reward": 0.5704427063465118,
      "reward_std": 0.015046583488583565,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.73046875,
      "rewards/tag_count_reward/std": 0.049575019627809525,
      "step": 331
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 931.25,
      "completions/max_terminated_length": 926.25,
      "completions/mean_length": 675.40625,
      "completions/mean_terminated_length": 667.1205444335938,
      "completions/min_length": 452.5,
      "completions/min_terminated_length": 452.5,
      "epoch": 0.11066666666666666,
      "grad_norm": 0.736656665802002,
      "kl": 0.2841796875,
      "learning_rate": 1.999349543104785e-05,
      "loss": 0.0279,
      "num_tokens": 16164204.0,
      "reward": 0.5726562142372131,
      "reward_std": 0.007206945912912488,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.7265625,
      "rewards/tag_count_reward/std": 0.07206955552101135,
      "step": 332
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 866.0,
      "completions/max_terminated_length": 864.5,
      "completions/mean_length": 637.90625,
      "completions/mean_terminated_length": 625.4010467529297,
      "completions/min_length": 379.25,
      "completions/min_terminated_length": 379.25,
      "epoch": 0.111,
      "grad_norm": 0.7842462062835693,
      "kl": 0.24560546875,
      "learning_rate": 1.999306906216052e-05,
      "loss": 0.034,
      "num_tokens": 16216166.0,
      "reward": 0.5722655951976776,
      "reward_std": 0.0074825764168053865,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.72265625,
      "rewards/tag_count_reward/std": 0.07482585124671459,
      "step": 333
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.25,
      "completions/max_length": 960.25,
      "completions/max_terminated_length": 860.25,
      "completions/mean_length": 720.734375,
      "completions/mean_terminated_length": 653.4479217529297,
      "completions/min_length": 497.0,
      "completions/min_terminated_length": 497.0,
      "epoch": 0.11133333333333334,
      "grad_norm": 0.5861374735832214,
      "kl": 0.278564453125,
      "learning_rate": 1.999262916410621e-05,
      "loss": 0.1111,
      "num_tokens": 16272725.0,
      "reward": 0.5554687529802322,
      "reward_std": 0.02883127611130476,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.984375,
      "rewards/reasoning_steps_reward/std": 0.03359273821115494,
      "rewards/tag_count_reward/mean": 0.6328125,
      "rewards/tag_count_reward/std": 0.14413107000291348,
      "step": 334
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 850.75,
      "completions/max_terminated_length": 850.75,
      "completions/mean_length": 604.015625,
      "completions/mean_terminated_length": 604.015625,
      "completions/min_length": 425.5,
      "completions/min_terminated_length": 425.5,
      "epoch": 0.11166666666666666,
      "grad_norm": 0.6174341440200806,
      "kl": 0.27734375,
      "learning_rate": 1.9992175737480487e-05,
      "loss": -0.0022,
      "num_tokens": 16319542.0,
      "reward": 0.6292968690395355,
      "reward_std": 0.15099719865247607,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0546875,
      "rewards/penalized_accuracy_reward/std": 0.14943470060825348,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.74609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 335
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1010.75,
      "completions/max_terminated_length": 955.5,
      "completions/mean_length": 762.46875,
      "completions/mean_terminated_length": 724.74658203125,
      "completions/min_length": 403.75,
      "completions/min_terminated_length": 403.75,
      "epoch": 0.112,
      "grad_norm": 0.8106991052627563,
      "kl": 0.32421875,
      "learning_rate": 1.9991708782897214e-05,
      "loss": 0.1122,
      "num_tokens": 16378708.0,
      "reward": 0.598828136920929,
      "reward_std": 0.11693336139433086,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.02734375,
      "rewards/penalized_accuracy_reward/std": 0.1093750074505806,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.71484375,
      "rewards/tag_count_reward/std": 0.07558366656303406,
      "step": 336
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 902.0,
      "completions/max_terminated_length": 902.0,
      "completions/mean_length": 558.296875,
      "completions/mean_terminated_length": 558.296875,
      "completions/min_length": 357.25,
      "completions/min_terminated_length": 357.25,
      "epoch": 0.11233333333333333,
      "grad_norm": 0.5837851166725159,
      "kl": 0.323486328125,
      "learning_rate": 1.9991228300988586e-05,
      "loss": -0.0411,
      "num_tokens": 16422359.0,
      "reward": 0.5695312321186066,
      "reward_std": 0.009778629755601287,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.6953125,
      "rewards/tag_count_reward/std": 0.09778633713722229,
      "step": 337
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 916.25,
      "completions/max_terminated_length": 916.25,
      "completions/mean_length": 701.453125,
      "completions/mean_terminated_length": 701.453125,
      "completions/min_length": 493.5,
      "completions/min_terminated_length": 493.5,
      "epoch": 0.11266666666666666,
      "grad_norm": 0.13602034747600555,
      "kl": 0.297607421875,
      "learning_rate": 1.9990734292405102e-05,
      "loss": 0.0119,
      "num_tokens": 16478340.0,
      "reward": 0.574999988079071,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.75,
      "rewards/tag_count_reward/std": 0.0,
      "step": 338
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 954.75,
      "completions/max_terminated_length": 931.25,
      "completions/mean_length": 700.265625,
      "completions/mean_terminated_length": 695.4854278564453,
      "completions/min_length": 471.25,
      "completions/min_terminated_length": 471.25,
      "epoch": 0.113,
      "grad_norm": 0.576396107673645,
      "kl": 0.301025390625,
      "learning_rate": 1.9990226757815582e-05,
      "loss": 0.0687,
      "num_tokens": 16533589.0,
      "reward": 0.5742187201976776,
      "reward_std": 0.0031249960884451866,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.7421875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 339
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 939.5,
      "completions/max_terminated_length": 937.25,
      "completions/mean_length": 723.921875,
      "completions/mean_terminated_length": 719.9479217529297,
      "completions/min_length": 458.5,
      "completions/min_terminated_length": 458.5,
      "epoch": 0.11333333333333333,
      "grad_norm": 0.5036214590072632,
      "kl": 0.281982421875,
      "learning_rate": 1.998970569790715e-05,
      "loss": -0.0162,
      "num_tokens": 16591232.0,
      "reward": 0.5712239295244217,
      "reward_std": 0.013554674573242664,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.73828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 340
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 874.75,
      "completions/max_terminated_length": 874.75,
      "completions/mean_length": 679.328125,
      "completions/mean_terminated_length": 679.328125,
      "completions/min_length": 461.0,
      "completions/min_terminated_length": 461.0,
      "epoch": 0.11366666666666667,
      "grad_norm": 0.11010044068098068,
      "kl": 0.271240234375,
      "learning_rate": 1.998917111338525e-05,
      "loss": 0.0109,
      "num_tokens": 16645077.0,
      "reward": 0.574999988079071,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.75,
      "rewards/tag_count_reward/std": 0.0,
      "step": 341
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 869.5,
      "completions/max_terminated_length": 869.5,
      "completions/mean_length": 642.3125,
      "completions/mean_terminated_length": 642.3125,
      "completions/min_length": 451.25,
      "completions/min_terminated_length": 451.25,
      "epoch": 0.114,
      "grad_norm": 0.4419842064380646,
      "kl": 0.294189453125,
      "learning_rate": 1.9988623004973625e-05,
      "loss": 0.0146,
      "num_tokens": 16696777.0,
      "reward": 0.6437411606311798,
      "reward_std": 0.20553494337946177,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.07134534418582916,
      "rewards/penalized_accuracy_reward/std": 0.1951182782649994,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.75,
      "rewards/tag_count_reward/std": 0.0,
      "step": 342
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 933.75,
      "completions/max_terminated_length": 933.75,
      "completions/mean_length": 598.890625,
      "completions/mean_terminated_length": 598.890625,
      "completions/min_length": 427.25,
      "completions/min_terminated_length": 427.25,
      "epoch": 0.11433333333333333,
      "grad_norm": 0.3295036852359772,
      "kl": 0.345703125,
      "learning_rate": 1.9988061373414342e-05,
      "loss": 0.0098,
      "num_tokens": 16746962.0,
      "reward": 0.5746093541383743,
      "reward_std": 0.0015624980442225933,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.74609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 343
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 912.0,
      "completions/max_terminated_length": 912.0,
      "completions/mean_length": 693.859375,
      "completions/mean_terminated_length": 693.859375,
      "completions/min_length": 460.75,
      "completions/min_terminated_length": 460.75,
      "epoch": 0.11466666666666667,
      "grad_norm": 0.6154027581214905,
      "kl": 0.265380859375,
      "learning_rate": 1.9987486219467764e-05,
      "loss": 0.0231,
      "num_tokens": 16803385.0,
      "reward": 0.5977044999599457,
      "reward_std": 0.17415135446935892,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.03312116861343384,
      "rewards/penalized_accuracy_reward/std": 0.13248467445373535,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.0833333320915699,
      "rewards/tag_count_reward/mean": 0.75,
      "rewards/tag_count_reward/std": 0.0,
      "step": 344
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 877.25,
      "completions/max_terminated_length": 863.0,
      "completions/mean_length": 706.3125,
      "completions/mean_terminated_length": 695.7043304443359,
      "completions/min_length": 444.0,
      "completions/min_terminated_length": 444.0,
      "epoch": 0.115,
      "grad_norm": 0.567071795463562,
      "kl": 0.263427734375,
      "learning_rate": 1.998689754391257e-05,
      "loss": 0.0459,
      "num_tokens": 16859837.0,
      "reward": 0.6614129096269608,
      "reward_std": 0.2113286810927093,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.09097020328044891,
      "rewards/penalized_accuracy_reward/std": 0.19595451653003693,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.73046875,
      "rewards/tag_count_reward/std": 0.049575019627809525,
      "step": 345
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 850.75,
      "completions/max_terminated_length": 848.75,
      "completions/mean_length": 662.859375,
      "completions/mean_terminated_length": 659.011474609375,
      "completions/min_length": 454.25,
      "completions/min_terminated_length": 454.25,
      "epoch": 0.11533333333333333,
      "grad_norm": 0.44871285557746887,
      "kl": 0.302978515625,
      "learning_rate": 1.9986295347545738e-05,
      "loss": 0.0135,
      "num_tokens": 16911460.0,
      "reward": 0.5993489623069763,
      "reward_std": 0.11980467848479748,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.02734375,
      "rewards/penalized_accuracy_reward/std": 0.1093750074505806,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.74609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 346
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 906.5,
      "completions/max_terminated_length": 906.5,
      "completions/mean_length": 697.1875,
      "completions/mean_terminated_length": 697.1875,
      "completions/min_length": 495.5,
      "completions/min_terminated_length": 495.5,
      "epoch": 0.11566666666666667,
      "grad_norm": 0.30112141370773315,
      "kl": 0.268310546875,
      "learning_rate": 1.998567963118256e-05,
      "loss": 0.0053,
      "num_tokens": 16964912.0,
      "reward": 0.739340677857399,
      "reward_std": 0.25176531076431274,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.16434067487716675,
      "rewards/penalized_accuracy_reward/std": 0.25176534056663513,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.75,
      "rewards/tag_count_reward/std": 0.0,
      "step": 347
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 970.75,
      "completions/max_terminated_length": 961.0,
      "completions/mean_length": 784.640625,
      "completions/mean_terminated_length": 775.7781524658203,
      "completions/min_length": 548.25,
      "completions/min_terminated_length": 548.25,
      "epoch": 0.116,
      "grad_norm": 0.5412817001342773,
      "kl": 0.266357421875,
      "learning_rate": 1.9985050395656617e-05,
      "loss": 0.0267,
      "num_tokens": 17027577.0,
      "reward": 0.7146791964769363,
      "reward_std": 0.3511555069126189,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.14436671882867813,
      "rewards/penalized_accuracy_reward/std": 0.3467045724391937,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.703125,
      "rewards/tag_count_reward/std": 0.15779344737529755,
      "step": 348
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 916.0,
      "completions/max_terminated_length": 914.0,
      "completions/mean_length": 709.125,
      "completions/mean_terminated_length": 705.4073028564453,
      "completions/min_length": 507.5,
      "completions/min_terminated_length": 507.5,
      "epoch": 0.11633333333333333,
      "grad_norm": 0.5882188677787781,
      "kl": 0.2978515625,
      "learning_rate": 1.9984407641819812e-05,
      "loss": 0.0179,
      "num_tokens": 17082321.0,
      "reward": 0.7354599088430405,
      "reward_std": 0.25636449502781034,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.16280364990234375,
      "rewards/penalized_accuracy_reward/std": 0.24956567585468292,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.7265625,
      "rewards/tag_count_reward/std": 0.06798820197582245,
      "step": 349
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 995.0,
      "completions/max_terminated_length": 978.5,
      "completions/mean_length": 765.28125,
      "completions/mean_terminated_length": 745.8104248046875,
      "completions/min_length": 524.25,
      "completions/min_terminated_length": 524.25,
      "epoch": 0.11666666666666667,
      "grad_norm": 0.5826079845428467,
      "kl": 0.281494140625,
      "learning_rate": 1.9983751370542334e-05,
      "loss": 0.0384,
      "num_tokens": 17140627.0,
      "reward": 0.5707031190395355,
      "reward_std": 0.013732579769566655,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.70703125,
      "rewards/tag_count_reward/std": 0.1373258512467146,
      "step": 350
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 951.0,
      "completions/max_terminated_length": 935.0,
      "completions/mean_length": 702.984375,
      "completions/mean_terminated_length": 699.2343902587891,
      "completions/min_length": 555.0,
      "completions/min_terminated_length": 555.0,
      "epoch": 0.117,
      "grad_norm": 0.4956066906452179,
      "kl": 0.2626953125,
      "learning_rate": 1.9983081582712684e-05,
      "loss": -0.0183,
      "num_tokens": 17196450.0,
      "reward": 0.5630208253860474,
      "reward_std": 0.0379206258803606,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.06615880131721497,
      "rewards/tag_count_reward/mean": 0.734375,
      "rewards/tag_count_reward/std": 0.04841229319572449,
      "step": 351
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 960.0,
      "completions/max_terminated_length": 908.5,
      "completions/mean_length": 751.046875,
      "completions/mean_terminated_length": 724.80419921875,
      "completions/min_length": 485.75,
      "completions/min_terminated_length": 485.75,
      "epoch": 0.11733333333333333,
      "grad_norm": 0.6335891485214233,
      "kl": 0.266357421875,
      "learning_rate": 1.9982398279237657e-05,
      "loss": 0.0123,
      "num_tokens": 17256325.0,
      "reward": 0.6546921133995056,
      "reward_std": 0.29282613936811686,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.08789525181055069,
      "rewards/penalized_accuracy_reward/std": 0.28226570785045624,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.66796875,
      "rewards/tag_count_reward/std": 0.21457599848508835,
      "step": 352
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.203125,
      "completions/max_length": 1008.25,
      "completions/max_terminated_length": 989.25,
      "completions/mean_length": 836.96875,
      "completions/mean_terminated_length": 810.9872283935547,
      "completions/min_length": 608.75,
      "completions/min_terminated_length": 608.75,
      "epoch": 0.11766666666666667,
      "grad_norm": 0.5203900933265686,
      "kl": 0.265380859375,
      "learning_rate": 1.998170146104234e-05,
      "loss": -0.0027,
      "num_tokens": 17318691.0,
      "reward": 0.5458333343267441,
      "reward_std": 0.07285358663648367,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9635416865348816,
      "rewards/reasoning_steps_reward/std": 0.12865879759192467,
      "rewards/tag_count_reward/mean": 0.640625,
      "rewards/tag_count_reward/std": 0.1502092145383358,
      "step": 353
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 995.25,
      "completions/max_terminated_length": 942.0,
      "completions/mean_length": 791.03125,
      "completions/mean_terminated_length": 752.4354553222656,
      "completions/min_length": 507.5,
      "completions/min_terminated_length": 507.5,
      "epoch": 0.118,
      "grad_norm": 0.53293377161026,
      "kl": 0.2705078125,
      "learning_rate": 1.998099112907013e-05,
      "loss": -0.0189,
      "num_tokens": 17379429.0,
      "reward": 0.535026028752327,
      "reward_std": 0.11204528529196978,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.927083358168602,
      "rewards/reasoning_steps_reward/std": 0.22220106050372124,
      "rewards/tag_count_reward/mean": 0.71484375,
      "rewards/tag_count_reward/std": 0.07449322193861008,
      "step": 354
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 997.75,
      "completions/max_terminated_length": 934.25,
      "completions/mean_length": 775.90625,
      "completions/mean_terminated_length": 751.3139343261719,
      "completions/min_length": 491.0,
      "completions/min_terminated_length": 491.0,
      "epoch": 0.11833333333333333,
      "grad_norm": 0.6207841634750366,
      "kl": 0.26513671875,
      "learning_rate": 1.9980267284282718e-05,
      "loss": 0.0175,
      "num_tokens": 17440223.0,
      "reward": 0.5654947757720947,
      "reward_std": 0.028851188253611326,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.70703125,
      "rewards/tag_count_reward/std": 0.09567352384328842,
      "step": 355
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 966.0,
      "completions/max_terminated_length": 952.75,
      "completions/mean_length": 815.171875,
      "completions/mean_terminated_length": 801.4709930419922,
      "completions/min_length": 624.75,
      "completions/min_terminated_length": 624.75,
      "epoch": 0.11866666666666667,
      "grad_norm": 0.34053659439086914,
      "kl": 0.239990234375,
      "learning_rate": 1.9979529927660076e-05,
      "loss": 0.0033,
      "num_tokens": 17502378.0,
      "reward": 0.5493489503860474,
      "reward_std": 0.05448907986283302,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9635416716337204,
      "rewards/reasoning_steps_reward/std": 0.10987519100308418,
      "rewards/tag_count_reward/mean": 0.67578125,
      "rewards/tag_count_reward/std": 0.08297448605298996,
      "step": 356
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.203125,
      "completions/max_length": 1005.5,
      "completions/max_terminated_length": 979.75,
      "completions/mean_length": 873.84375,
      "completions/mean_terminated_length": 856.6114654541016,
      "completions/min_length": 668.0,
      "completions/min_terminated_length": 668.0,
      "epoch": 0.119,
      "grad_norm": 0.5661823749542236,
      "kl": 0.26953125,
      "learning_rate": 1.9978779060200483e-05,
      "loss": 0.0126,
      "num_tokens": 17568848.0,
      "reward": 0.7227516770362854,
      "reward_std": 0.36486852215602994,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.16168397665023804,
      "rewards/penalized_accuracy_reward/std": 0.34969785064458847,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.63671875,
      "rewards/tag_count_reward/std": 0.09947755187749863,
      "step": 357
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.4375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.25,
      "completions/mean_length": 912.40625,
      "completions/mean_terminated_length": 845.8678894042969,
      "completions/min_length": 715.75,
      "completions/min_terminated_length": 715.75,
      "epoch": 0.11933333333333333,
      "grad_norm": 0.5193860530853271,
      "kl": 0.23828125,
      "learning_rate": 1.9978014682920503e-05,
      "loss": 0.0729,
      "num_tokens": 17636410.0,
      "reward": 0.5519531220197678,
      "reward_std": 0.023164119804278016,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.51953125,
      "rewards/tag_count_reward/std": 0.23164120875298977,
      "step": 358
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 975.0,
      "completions/mean_length": 811.34375,
      "completions/mean_terminated_length": 788.9231262207031,
      "completions/min_length": 602.5,
      "completions/min_terminated_length": 602.5,
      "epoch": 0.11966666666666667,
      "grad_norm": 0.6877517104148865,
      "kl": 0.2841796875,
      "learning_rate": 1.997723679685499e-05,
      "loss": 0.0868,
      "num_tokens": 17700176.0,
      "reward": 0.5710937082767487,
      "reward_std": 0.01112984400242567,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.7109375,
      "rewards/tag_count_reward/std": 0.11129852384328842,
      "step": 359
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 882.5,
      "completions/max_terminated_length": 880.75,
      "completions/mean_length": 619.78125,
      "completions/mean_terminated_length": 615.1302185058594,
      "completions/min_length": 470.25,
      "completions/min_terminated_length": 470.25,
      "epoch": 0.12,
      "grad_norm": 0.6614722013473511,
      "kl": 0.2841796875,
      "learning_rate": 1.9976445403057095e-05,
      "loss": 0.0213,
      "num_tokens": 17753586.0,
      "reward": 0.568229153752327,
      "reward_std": 0.025533841457217932,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.734375,
      "rewards/tag_count_reward/std": 0.0625,
      "step": 360
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 1017.0,
      "completions/max_terminated_length": 977.5,
      "completions/mean_length": 794.515625,
      "completions/mean_terminated_length": 777.9619293212891,
      "completions/min_length": 543.25,
      "completions/min_terminated_length": 543.25,
      "epoch": 0.12033333333333333,
      "grad_norm": 0.587173342704773,
      "kl": 0.2568359375,
      "learning_rate": 1.9975640502598243e-05,
      "loss": 0.0382,
      "num_tokens": 17813075.0,
      "reward": 0.7127092778682709,
      "reward_std": 0.27483612578362226,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.14942806959152222,
      "rewards/penalized_accuracy_reward/std": 0.2673214375972748,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.0624999962747097,
      "rewards/tag_count_reward/mean": 0.7109375,
      "rewards/tag_count_reward/std": 0.12654344737529755,
      "step": 361
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 812.25,
      "completions/max_terminated_length": 802.0,
      "completions/mean_length": 624.359375,
      "completions/mean_terminated_length": 620.5625152587891,
      "completions/min_length": 443.0,
      "completions/min_terminated_length": 443.0,
      "epoch": 0.12066666666666667,
      "grad_norm": 0.578199565410614,
      "kl": 0.265380859375,
      "learning_rate": 1.9974822096568157e-05,
      "loss": 0.0284,
      "num_tokens": 17861130.0,
      "reward": 0.7899739593267441,
      "reward_std": 0.375191253144294,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.21875,
      "rewards/penalized_accuracy_reward/std": 0.3681847155094147,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.73828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 362
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 839.5,
      "completions/max_terminated_length": 809.0,
      "completions/mean_length": 624.171875,
      "completions/mean_terminated_length": 610.0516967773438,
      "completions/min_length": 415.25,
      "completions/min_terminated_length": 415.25,
      "epoch": 0.121,
      "grad_norm": 0.5601195693016052,
      "kl": 0.2763671875,
      "learning_rate": 1.9973990186074844e-05,
      "loss": 0.0437,
      "num_tokens": 17911253.0,
      "reward": 0.5708333253860474,
      "reward_std": 0.012949130265042186,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.734375,
      "rewards/tag_count_reward/std": 0.04081955552101135,
      "step": 363
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 754.75,
      "completions/max_terminated_length": 754.75,
      "completions/mean_length": 596.171875,
      "completions/mean_terminated_length": 596.171875,
      "completions/min_length": 450.75,
      "completions/min_terminated_length": 450.75,
      "epoch": 0.12133333333333333,
      "grad_norm": 0.6716346144676208,
      "kl": 0.2734375,
      "learning_rate": 1.997314477224458e-05,
      "loss": 0.0063,
      "num_tokens": 17961216.0,
      "reward": 0.55859375,
      "reward_std": 0.05590894632041454,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9687500298023224,
      "rewards/reasoning_steps_reward/std": 0.11179707944393158,
      "rewards/tag_count_reward/mean": 0.7421875,
      "rewards/tag_count_reward/std": 0.021347815170884132,
      "step": 364
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 684.25,
      "completions/max_terminated_length": 684.25,
      "completions/mean_length": 487.234375,
      "completions/mean_terminated_length": 487.234375,
      "completions/min_length": 359.25,
      "completions/min_terminated_length": 359.25,
      "epoch": 0.12166666666666667,
      "grad_norm": 0.4668358266353607,
      "kl": 0.3193359375,
      "learning_rate": 1.9972285856221944e-05,
      "loss": 0.0521,
      "num_tokens": 18000831.0,
      "reward": 0.7272274792194366,
      "reward_std": 0.37441834807395935,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.1626441404223442,
      "rewards/penalized_accuracy_reward/std": 0.3894267678260803,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.048112526535987854,
      "rewards/tag_count_reward/mean": 0.75,
      "rewards/tag_count_reward/std": 0.0,
      "step": 365
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 586.5,
      "completions/max_terminated_length": 586.5,
      "completions/mean_length": 485.96875,
      "completions/mean_terminated_length": 485.96875,
      "completions/min_length": 339.75,
      "completions/min_terminated_length": 339.75,
      "epoch": 0.122,
      "grad_norm": 0.751215398311615,
      "kl": 0.33203125,
      "learning_rate": 1.9971413439169777e-05,
      "loss": 0.0108,
      "num_tokens": 18044077.0,
      "reward": 0.6277906894683838,
      "reward_std": 0.22004716284573078,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.06516048312187195,
      "rewards/penalized_accuracy_reward/std": 0.17814074456691742,
      "rewards/reasoning_steps_reward/mean": 0.9791666865348816,
      "rewards/reasoning_steps_reward/std": 0.0833333283662796,
      "rewards/tag_count_reward/mean": 0.73046875,
      "rewards/tag_count_reward/std": 0.078125,
      "step": 366
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 613.25,
      "completions/max_terminated_length": 613.25,
      "completions/mean_length": 449.234375,
      "completions/mean_terminated_length": 449.234375,
      "completions/min_length": 348.75,
      "completions/min_terminated_length": 348.75,
      "epoch": 0.12233333333333334,
      "grad_norm": 0.7573283314704895,
      "kl": 0.361328125,
      "learning_rate": 1.9970527522269204e-05,
      "loss": 0.0358,
      "num_tokens": 18084364.0,
      "reward": 0.5644531100988388,
      "reward_std": 0.03857939247973263,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.984375,
      "rewards/reasoning_steps_reward/std": 0.0625,
      "rewards/tag_count_reward/mean": 0.72265625,
      "rewards/tag_count_reward/std": 0.08957063034176826,
      "step": 367
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 738.0,
      "completions/max_terminated_length": 636.75,
      "completions/mean_length": 489.0625,
      "completions/mean_terminated_length": 480.8500061035156,
      "completions/min_length": 368.0,
      "completions/min_terminated_length": 368.0,
      "epoch": 0.12266666666666666,
      "grad_norm": 0.8003387451171875,
      "kl": 0.3837890625,
      "learning_rate": 1.9969628106719632e-05,
      "loss": 0.0422,
      "num_tokens": 18124912.0,
      "reward": 0.5933363288640976,
      "reward_std": 0.13590667862445116,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.02862280048429966,
      "rewards/penalized_accuracy_reward/std": 0.11449120938777924,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.028463751077651978,
      "rewards/tag_count_reward/mean": 0.69921875,
      "rewards/tag_count_reward/std": 0.13621489331126213,
      "step": 368
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 782.0,
      "completions/max_terminated_length": 732.25,
      "completions/mean_length": 533.4375,
      "completions/mean_terminated_length": 524.9697952270508,
      "completions/min_length": 367.25,
      "completions/min_terminated_length": 367.25,
      "epoch": 0.123,
      "grad_norm": 0.5432648062705994,
      "kl": 0.32763671875,
      "learning_rate": 1.9968715193738738e-05,
      "loss": 0.0231,
      "num_tokens": 18168988.0,
      "reward": 0.6612445116043091,
      "reward_std": 0.28088031709194183,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0874163992702961,
      "rewards/penalized_accuracy_reward/std": 0.28035029768943787,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.73828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 369
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 870.25,
      "completions/max_terminated_length": 714.75,
      "completions/mean_length": 544.421875,
      "completions/mean_terminated_length": 522.0605773925781,
      "completions/min_length": 387.5,
      "completions/min_terminated_length": 387.5,
      "epoch": 0.12333333333333334,
      "grad_norm": 0.9099456667900085,
      "kl": 0.35009765625,
      "learning_rate": 1.9967788784562474e-05,
      "loss": 0.1243,
      "num_tokens": 18214103.0,
      "reward": 0.5684895664453506,
      "reward_std": 0.02152151893824339,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.7109375,
      "rewards/tag_count_reward/std": 0.12654344737529755,
      "step": 370
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 823.5,
      "completions/max_terminated_length": 763.5,
      "completions/mean_length": 540.8125,
      "completions/mean_terminated_length": 526.511474609375,
      "completions/min_length": 351.75,
      "completions/min_terminated_length": 351.75,
      "epoch": 0.12366666666666666,
      "grad_norm": 0.8498043417930603,
      "kl": 0.3798828125,
      "learning_rate": 1.996684888044506e-05,
      "loss": 0.0597,
      "num_tokens": 18257963.0,
      "reward": 0.5631510317325592,
      "reward_std": 0.0337344182189554,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.68359375,
      "rewards/tag_count_reward/std": 0.16335688158869743,
      "step": 371
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 723.0,
      "completions/max_terminated_length": 599.25,
      "completions/mean_length": 525.84375,
      "completions/mean_terminated_length": 491.234375,
      "completions/min_length": 404.75,
      "completions/min_terminated_length": 404.75,
      "epoch": 0.124,
      "grad_norm": 1.7754876613616943,
      "kl": 0.46435546875,
      "learning_rate": 1.9965895482659e-05,
      "loss": 0.1219,
      "num_tokens": 18300225.0,
      "reward": 0.5675781071186066,
      "reward_std": 0.01923357043415308,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.67578125,
      "rewards/tag_count_reward/std": 0.19233575090765953,
      "step": 372
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 848.25,
      "completions/mean_length": 606.40625,
      "completions/mean_terminated_length": 537.673095703125,
      "completions/min_length": 349.5,
      "completions/min_terminated_length": 349.5,
      "epoch": 0.12433333333333334,
      "grad_norm": 1.5454201698303223,
      "kl": 0.68017578125,
      "learning_rate": 1.9964928592495046e-05,
      "loss": 0.0783,
      "num_tokens": 18348267.0,
      "reward": 0.6182190477848053,
      "reward_std": 0.28114900551736355,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.06665654852986336,
      "rewards/penalized_accuracy_reward/std": 0.26662619411945343,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.515625,
      "rewards/tag_count_reward/std": 0.30157821998000145,
      "step": 373
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.515625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 588.75,
      "completions/mean_length": 739.40625,
      "completions/mean_terminated_length": 440.1388931274414,
      "completions/min_length": 317.0,
      "completions/min_terminated_length": 317.0,
      "epoch": 0.12466666666666666,
      "grad_norm": 4.628246784210205,
      "kl": 0.96484375,
      "learning_rate": 1.9963948211262233e-05,
      "loss": 0.3434,
      "num_tokens": 18404981.0,
      "reward": 0.5234375149011612,
      "reward_std": 0.032967695500701666,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.234375,
      "rewards/tag_count_reward/std": 0.3296769931912422,
      "step": 374
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 674.5,
      "completions/mean_length": 785.046875,
      "completions/mean_terminated_length": 472.26390075683594,
      "completions/min_length": 319.25,
      "completions/min_terminated_length": 319.25,
      "epoch": 0.125,
      "grad_norm": 5.922732830047607,
      "kl": 2.6875,
      "learning_rate": 1.996295434028785e-05,
      "loss": 0.303,
      "num_tokens": 18464216.0,
      "reward": 0.5171875059604645,
      "reward_std": 0.02666703937575221,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.171875,
      "rewards/tag_count_reward/std": 0.26667042449116707,
      "step": 375
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 518.0,
      "completions/mean_length": 871.25,
      "completions/mean_terminated_length": 375.8041763305664,
      "completions/min_length": 272.0,
      "completions/min_terminated_length": 272.0,
      "epoch": 0.12533333333333332,
      "grad_norm": 17.34748077392578,
      "kl": 13.953125,
      "learning_rate": 1.9961946980917457e-05,
      "loss": 0.7341,
      "num_tokens": 18531832.0,
      "reward": 0.5095052123069763,
      "reward_std": 0.02496273792348802,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.12109375,
      "rewards/tag_count_reward/std": 0.21139980107545853,
      "step": 376
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.75,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 379.25,
      "completions/mean_length": 859.453125,
      "completions/mean_terminated_length": 333.0506057739258,
      "completions/min_length": 272.5,
      "completions/min_terminated_length": 272.5,
      "epoch": 0.12566666666666668,
      "grad_norm": 5.47519588470459,
      "kl": 5.6015625,
      "learning_rate": 1.9960926134514875e-05,
      "loss": 0.4119,
      "num_tokens": 18597093.0,
      "reward": 0.5238281190395355,
      "reward_std": 0.0280300946906209,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.23828125,
      "rewards/tag_count_reward/std": 0.28030097112059593,
      "step": 377
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 616.75,
      "completions/mean_length": 892.125,
      "completions/mean_terminated_length": 459.1666793823242,
      "completions/min_length": 331.75,
      "completions/min_terminated_length": 331.75,
      "epoch": 0.126,
      "grad_norm": 1.6290124654769897,
      "kl": 1.328125,
      "learning_rate": 1.995989180246218e-05,
      "loss": 0.1209,
      "num_tokens": 18666829.0,
      "reward": 0.583984375,
      "reward_std": 0.2336447136476636,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0546875,
      "rewards/penalized_accuracy_reward/std": 0.2187500149011612,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.29296875,
      "rewards/tag_count_reward/std": 0.3123226538300514,
      "step": 378
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.5625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 748.75,
      "completions/mean_length": 807.015625,
      "completions/mean_terminated_length": 535.275016784668,
      "completions/min_length": 323.5,
      "completions/min_terminated_length": 323.5,
      "epoch": 0.12633333333333333,
      "grad_norm": 4.420656681060791,
      "kl": 0.716796875,
      "learning_rate": 1.9958843986159705e-05,
      "loss": 0.2669,
      "num_tokens": 18729774.0,
      "reward": 0.542187511920929,
      "reward_std": 0.031155919656157494,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.421875,
      "rewards/tag_count_reward/std": 0.31155921518802643,
      "step": 379
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.765625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 650.5,
      "completions/mean_length": 921.90625,
      "completions/mean_terminated_length": 532.9642944335938,
      "completions/min_length": 427.75,
      "completions/min_terminated_length": 427.75,
      "epoch": 0.12666666666666668,
      "grad_norm": 3.003019332885742,
      "kl": 1.05859375,
      "learning_rate": 1.9957782687026046e-05,
      "loss": 0.1814,
      "num_tokens": 18800120.0,
      "reward": 0.5316406339406967,
      "reward_std": 0.028903153259307146,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.31640625,
      "rewards/tag_count_reward/std": 0.2890315502882004,
      "step": 380
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.9375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 297.5,
      "completions/mean_length": 985.6875,
      "completions/mean_terminated_length": 285.875,
      "completions/min_length": 530.25,
      "completions/min_terminated_length": 274.25,
      "epoch": 0.127,
      "grad_norm": 2.79579758644104,
      "kl": 1.951171875,
      "learning_rate": 1.9956707906498046e-05,
      "loss": 0.1413,
      "num_tokens": 18871812.0,
      "reward": 0.5439791083335876,
      "reward_std": 0.1430248417891562,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.031479090452194214,
      "rewards/penalized_accuracy_reward/std": 0.12591636180877686,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.125,
      "rewards/tag_count_reward/std": 0.2432589866220951,
      "step": 381
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 952.25,
      "completions/mean_length": 770.203125,
      "completions/mean_terminated_length": 614.058349609375,
      "completions/min_length": 251.25,
      "completions/min_terminated_length": 251.25,
      "epoch": 0.12733333333333333,
      "grad_norm": 4.434508800506592,
      "kl": 1.71875,
      "learning_rate": 1.99556196460308e-05,
      "loss": 0.2662,
      "num_tokens": 18929281.0,
      "reward": 0.5570312291383743,
      "reward_std": 0.026602684520184994,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.5703125,
      "rewards/tag_count_reward/std": 0.26602689176797867,
      "step": 382
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 992.0,
      "completions/max_terminated_length": 834.0,
      "completions/mean_length": 495.375,
      "completions/mean_terminated_length": 461.93975830078125,
      "completions/min_length": 225.75,
      "completions/min_terminated_length": 225.75,
      "epoch": 0.12766666666666668,
      "grad_norm": 3.1257944107055664,
      "kl": 2.83984375,
      "learning_rate": 1.9954517907097663e-05,
      "loss": 0.2395,
      "num_tokens": 18970041.0,
      "reward": 0.5675781071186066,
      "reward_std": 0.017155689420178533,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.67578125,
      "rewards/tag_count_reward/std": 0.17155694775283337,
      "step": 383
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 899.0,
      "completions/max_terminated_length": 882.5,
      "completions/mean_length": 484.734375,
      "completions/mean_terminated_length": 474.9291687011719,
      "completions/min_length": 206.5,
      "completions/min_terminated_length": 206.5,
      "epoch": 0.128,
      "grad_norm": 4.114688396453857,
      "kl": 2.6572265625,
      "learning_rate": 1.9953402691190218e-05,
      "loss": 0.1518,
      "num_tokens": 19010744.0,
      "reward": 0.5683593451976776,
      "reward_std": 0.013292336370795965,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.68359375,
      "rewards/tag_count_reward/std": 0.1329234316945076,
      "step": 384
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 1022.0,
      "completions/max_terminated_length": 944.5,
      "completions/mean_length": 647.09375,
      "completions/mean_terminated_length": 613.0419769287109,
      "completions/min_length": 319.75,
      "completions/min_terminated_length": 319.75,
      "epoch": 0.12833333333333333,
      "grad_norm": 2.3895230293273926,
      "kl": 0.8369140625,
      "learning_rate": 1.9952273999818312e-05,
      "loss": 0.14,
      "num_tokens": 19062798.0,
      "reward": 0.5722656100988388,
      "reward_std": 0.006789007456973195,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.72265625,
      "rewards/tag_count_reward/std": 0.06789018586277962,
      "step": 385
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 1017.25,
      "completions/max_terminated_length": 948.25,
      "completions/mean_length": 666.53125,
      "completions/mean_terminated_length": 637.3544921875,
      "completions/min_length": 382.75,
      "completions/min_terminated_length": 382.75,
      "epoch": 0.12866666666666668,
      "grad_norm": 4.826654434204102,
      "kl": 2.22265625,
      "learning_rate": 1.9951131834510034e-05,
      "loss": 0.2538,
      "num_tokens": 19114320.0,
      "reward": 0.567187488079071,
      "reward_std": 0.022259699180722237,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.671875,
      "rewards/tag_count_reward/std": 0.22259704768657684,
      "step": 386
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.296875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 863.5,
      "completions/mean_length": 737.1875,
      "completions/mean_terminated_length": 614.6210479736328,
      "completions/min_length": 395.25,
      "completions/min_terminated_length": 395.25,
      "epoch": 0.129,
      "grad_norm": 18.4852294921875,
      "kl": 16.25,
      "learning_rate": 1.99499761968117e-05,
      "loss": 0.8592,
      "num_tokens": 19170492.0,
      "reward": 0.5429687350988388,
      "reward_std": 0.04709636978805065,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.984375,
      "rewards/reasoning_steps_reward/std": 0.03359273821115494,
      "rewards/tag_count_reward/mean": 0.5078125,
      "rewards/tag_count_reward/std": 0.33248988538980484,
      "step": 387
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.296875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 808.25,
      "completions/mean_length": 687.9375,
      "completions/mean_terminated_length": 549.6305084228516,
      "completions/min_length": 338.25,
      "completions/min_terminated_length": 338.25,
      "epoch": 0.12933333333333333,
      "grad_norm": 10.924671173095703,
      "kl": 12.734375,
      "learning_rate": 1.9948807088287884e-05,
      "loss": 0.7275,
      "num_tokens": 19224488.0,
      "reward": 0.5419270843267441,
      "reward_std": 0.0634591830894351,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9791666865348816,
      "rewards/reasoning_steps_reward/std": 0.0833333283662796,
      "rewards/tag_count_reward/mean": 0.5234375,
      "rewards/tag_count_reward/std": 0.31878840923309326,
      "step": 388
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.203125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 920.25,
      "completions/mean_length": 768.328125,
      "completions/mean_terminated_length": 704.0210876464844,
      "completions/min_length": 384.75,
      "completions/min_terminated_length": 384.75,
      "epoch": 0.12966666666666668,
      "grad_norm": 2.4064109325408936,
      "kl": 1.265625,
      "learning_rate": 1.9947624510521385e-05,
      "loss": 0.1485,
      "num_tokens": 19282669.0,
      "reward": 0.5677083283662796,
      "reward_std": 0.017858162289485335,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.703125,
      "rewards/tag_count_reward/std": 0.09968777745962143,
      "step": 389
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 966.0,
      "completions/max_terminated_length": 955.25,
      "completions/mean_length": 700.609375,
      "completions/mean_terminated_length": 683.7254638671875,
      "completions/min_length": 413.5,
      "completions/min_terminated_length": 413.5,
      "epoch": 0.13,
      "grad_norm": 2.1699938774108887,
      "kl": 0.865234375,
      "learning_rate": 1.9946428465113244e-05,
      "loss": 0.0691,
      "num_tokens": 19340228.0,
      "reward": 0.5976562649011612,
      "reward_std": 0.12052598781883717,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.02734375,
      "rewards/penalized_accuracy_reward/std": 0.1093750074505806,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.703125,
      "rewards/tag_count_reward/std": 0.1259822454303503,
      "step": 390
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.953125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 210.25,
      "completions/mean_length": 989.140625,
      "completions/mean_terminated_length": 210.25,
      "completions/min_length": 466.25,
      "completions/min_terminated_length": 210.25,
      "epoch": 0.13033333333333333,
      "grad_norm": 10.679924964904785,
      "kl": 7.421875,
      "learning_rate": 1.9945218953682736e-05,
      "loss": 0.2918,
      "num_tokens": 19413789.0,
      "reward": 0.5009114369750023,
      "reward_std": 0.05319773964583874,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9635417014360428,
      "rewards/reasoning_steps_reward/std": 0.10622458532452583,
      "rewards/tag_count_reward/mean": 0.19140625,
      "rewards/tag_count_reward/std": 0.17257864400744438,
      "step": 391
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.13066666666666665,
      "grad_norm": 6.155830383300781,
      "kl": 3.9375,
      "learning_rate": 1.9943995977867358e-05,
      "loss": 0.1575,
      "num_tokens": 19489197.0,
      "reward": 0.48684895783662796,
      "reward_std": 0.08839214779436588,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9479166865348816,
      "rewards/reasoning_steps_reward/std": 0.17237518727779388,
      "rewards/tag_count_reward/mean": 0.12890625,
      "rewards/tag_count_reward/std": 0.1510934755206108,
      "step": 392
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.131,
      "grad_norm": 2.1352784633636475,
      "kl": 1.34765625,
      "learning_rate": 1.9942759539322845e-05,
      "loss": 0.0539,
      "num_tokens": 19563085.0,
      "reward": 0.4867187440395355,
      "reward_std": 0.09090410731732845,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9531250298023224,
      "rewards/reasoning_steps_reward/std": 0.17429707944393158,
      "rewards/tag_count_reward/mean": 0.1015625,
      "rewards/tag_count_reward/std": 0.16582809947431087,
      "step": 393
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.13133333333333333,
      "grad_norm": 0.7536581754684448,
      "kl": 0.5791015625,
      "learning_rate": 1.9941509639723155e-05,
      "loss": 0.0232,
      "num_tokens": 19640941.0,
      "reward": 0.510937511920929,
      "reward_std": 0.013339085271582007,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.109375,
      "rewards/tag_count_reward/std": 0.13339098542928696,
      "step": 394
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.13166666666666665,
      "grad_norm": 0.6516260504722595,
      "kl": 0.5595703125,
      "learning_rate": 1.9940246280760473e-05,
      "loss": 0.0224,
      "num_tokens": 19716349.0,
      "reward": 0.5140624940395355,
      "reward_std": 0.017406899016350508,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.140625,
      "rewards/tag_count_reward/std": 0.1740690991282463,
      "step": 395
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.132,
      "grad_norm": 1.2135748863220215,
      "kl": 0.5634765625,
      "learning_rate": 1.99389694641452e-05,
      "loss": 0.0225,
      "num_tokens": 19793085.0,
      "reward": 0.5101562589406967,
      "reward_std": 0.015438517788425088,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.1015625,
      "rewards/tag_count_reward/std": 0.15438531525433064,
      "step": 396
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.13233333333333333,
      "grad_norm": 0.4990180432796478,
      "kl": 0.27490234375,
      "learning_rate": 1.9937679191605964e-05,
      "loss": 0.011,
      "num_tokens": 19868333.0,
      "reward": 0.46757811307907104,
      "reward_std": 0.08261131285689771,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9218750149011612,
      "rewards/reasoning_steps_reward/std": 0.15227919816970825,
      "rewards/tag_count_reward/mean": 0.06640625,
      "rewards/tag_count_reward/std": 0.10607585124671459,
      "step": 397
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.13266666666666665,
      "grad_norm": 0.3590187132358551,
      "kl": 0.30517578125,
      "learning_rate": 1.9936375464889608e-05,
      "loss": 0.0122,
      "num_tokens": 19942237.0,
      "reward": 0.4924479126930237,
      "reward_std": 0.05821423279121518,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.1041666641831398,
      "rewards/tag_count_reward/mean": 0.0546875,
      "rewards/tag_count_reward/std": 0.10298692621290684,
      "step": 398
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.984375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 36.25,
      "completions/mean_length": 1010.265625,
      "completions/mean_terminated_length": 36.25,
      "completions/min_length": 804.25,
      "completions/min_terminated_length": 36.25,
      "epoch": 0.133,
      "grad_norm": 0.419431209564209,
      "kl": 0.32568359375,
      "learning_rate": 1.9935058285761185e-05,
      "loss": -0.0386,
      "num_tokens": 20017102.0,
      "reward": 0.4997395724058151,
      "reward_std": 0.013995711575262249,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.0234375,
      "rewards/tag_count_reward/std": 0.07206955552101135,
      "step": 399
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.13333333333333333,
      "grad_norm": 0.6517958641052246,
      "kl": 0.4580078125,
      "learning_rate": 1.9933727656003964e-05,
      "loss": 0.0183,
      "num_tokens": 20094958.0,
      "reward": 0.5682201683521271,
      "reward_std": 0.16847115964628756,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.057673290371894836,
      "rewards/penalized_accuracy_reward/std": 0.15761536359786987,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.10546875,
      "rewards/tag_count_reward/std": 0.15373114496469498,
      "step": 400
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.953125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 72.75,
      "completions/mean_length": 988.0,
      "completions/mean_terminated_length": 64.0,
      "completions/min_length": 826.75,
      "completions/min_terminated_length": 58.75,
      "epoch": 0.13366666666666666,
      "grad_norm": 0.6319084167480469,
      "kl": 0.521484375,
      "learning_rate": 1.9932383577419432e-05,
      "loss": 0.0716,
      "num_tokens": 20170494.0,
      "reward": 0.5425781011581421,
      "reward_std": 0.12124298885464668,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.02734375,
      "rewards/penalized_accuracy_reward/std": 0.1093750074505806,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.15234375,
      "rewards/tag_count_reward/std": 0.14299625158309937,
      "step": 401
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.546875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 851.75,
      "completions/mean_length": 848.671875,
      "completions/mean_terminated_length": 644.4384002685547,
      "completions/min_length": 446.75,
      "completions/min_terminated_length": 446.75,
      "epoch": 0.134,
      "grad_norm": 0.7354158163070679,
      "kl": 0.50830078125,
      "learning_rate": 1.993102605182727e-05,
      "loss": 0.127,
      "num_tokens": 20238521.0,
      "reward": 0.5234374850988388,
      "reward_std": 0.018410819116979837,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.234375,
      "rewards/tag_count_reward/std": 0.18410814180970192,
      "step": 402
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 953.75,
      "completions/max_terminated_length": 953.75,
      "completions/mean_length": 533.84375,
      "completions/mean_terminated_length": 533.84375,
      "completions/min_length": 180.25,
      "completions/min_terminated_length": 180.25,
      "epoch": 0.13433333333333333,
      "grad_norm": 0.7467077374458313,
      "kl": 0.4462890625,
      "learning_rate": 1.992965508106537e-05,
      "loss": -0.115,
      "num_tokens": 20285247.0,
      "reward": 0.5534554272890091,
      "reward_std": 0.17384058702737093,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.031710632145404816,
      "rewards/penalized_accuracy_reward/std": 0.12684252858161926,
      "rewards/reasoning_steps_reward/mean": 0.973958358168602,
      "rewards/reasoning_steps_reward/std": 0.1041666604578495,
      "rewards/tag_count_reward/mean": 0.34765625,
      "rewards/tag_count_reward/std": 0.13082130625844002,
      "step": 403
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.25,
      "completions/mean_length": 858.03125,
      "completions/mean_terminated_length": 764.8402862548828,
      "completions/min_length": 512.5,
      "completions/min_terminated_length": 512.5,
      "epoch": 0.13466666666666666,
      "grad_norm": 0.7295824885368347,
      "kl": 0.3515625,
      "learning_rate": 1.9928270666989835e-05,
      "loss": 0.1089,
      "num_tokens": 20349473.0,
      "reward": 0.5296874940395355,
      "reward_std": 0.017016594298183918,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.296875,
      "rewards/tag_count_reward/std": 0.17016583308577538,
      "step": 404
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1006.0,
      "completions/max_terminated_length": 999.0,
      "completions/mean_length": 796.265625,
      "completions/mean_terminated_length": 744.2351226806641,
      "completions/min_length": 479.0,
      "completions/min_terminated_length": 479.0,
      "epoch": 0.135,
      "grad_norm": 0.7597048282623291,
      "kl": 0.38818359375,
      "learning_rate": 1.9926872811474952e-05,
      "loss": 0.1247,
      "num_tokens": 20410114.0,
      "reward": 0.5359375029802322,
      "reward_std": 0.016042925650253892,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.359375,
      "rewards/tag_count_reward/std": 0.16042909026145935,
      "step": 405
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 813.25,
      "completions/max_terminated_length": 725.0,
      "completions/mean_length": 454.0,
      "completions/mean_terminated_length": 444.6010437011719,
      "completions/min_length": 208.25,
      "completions/min_terminated_length": 208.25,
      "epoch": 0.13533333333333333,
      "grad_norm": 0.9214096069335938,
      "kl": 0.42822265625,
      "learning_rate": 1.9925461516413224e-05,
      "loss": 0.012,
      "num_tokens": 20447826.0,
      "reward": 0.5419271141290665,
      "reward_std": 0.019258577842265368,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.4453125,
      "rewards/tag_count_reward/std": 0.11347946338355541,
      "step": 406
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 528.0,
      "completions/max_terminated_length": 528.0,
      "completions/mean_length": 367.0625,
      "completions/mean_terminated_length": 367.0625,
      "completions/min_length": 179.5,
      "completions/min_terminated_length": 179.5,
      "epoch": 0.13566666666666666,
      "grad_norm": 0.9415867328643799,
      "kl": 0.47021484375,
      "learning_rate": 1.992403678371533e-05,
      "loss": -0.0278,
      "num_tokens": 20481142.0,
      "reward": 0.5308593809604645,
      "reward_std": 0.05694529181346297,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9687500149011612,
      "rewards/reasoning_steps_reward/std": 0.11179708316922188,
      "rewards/tag_count_reward/mean": 0.46484375,
      "rewards/tag_count_reward/std": 0.10490768030285835,
      "step": 407
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 600.25,
      "completions/max_terminated_length": 600.25,
      "completions/mean_length": 372.21875,
      "completions/mean_terminated_length": 372.21875,
      "completions/min_length": 210.75,
      "completions/min_terminated_length": 210.75,
      "epoch": 0.136,
      "grad_norm": 0.909557044506073,
      "kl": 0.4541015625,
      "learning_rate": 1.9922598615310157e-05,
      "loss": -0.033,
      "num_tokens": 20513140.0,
      "reward": 0.48268231749534607,
      "reward_std": 0.129038886167109,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.8802083432674408,
      "rewards/reasoning_steps_reward/std": 0.2561502903699875,
      "rewards/tag_count_reward/mean": 0.42578125,
      "rewards/tag_count_reward/std": 0.12520484067499638,
      "step": 408
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 939.75,
      "completions/max_terminated_length": 784.5,
      "completions/mean_length": 545.140625,
      "completions/mean_terminated_length": 528.7177200317383,
      "completions/min_length": 287.5,
      "completions/min_terminated_length": 287.5,
      "epoch": 0.13633333333333333,
      "grad_norm": 0.7070683836936951,
      "kl": 0.3916015625,
      "learning_rate": 1.9921147013144782e-05,
      "loss": -0.0569,
      "num_tokens": 20555485.0,
      "reward": 0.5261718928813934,
      "reward_std": 0.04742140416055918,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.96875,
      "rewards/reasoning_steps_reward/std": 0.07375510036945343,
      "rewards/tag_count_reward/mean": 0.41796875,
      "rewards/tag_count_reward/std": 0.12568620964884758,
      "step": 409
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 926.0,
      "completions/max_terminated_length": 926.0,
      "completions/mean_length": 685.40625,
      "completions/mean_terminated_length": 685.40625,
      "completions/min_length": 433.25,
      "completions/min_terminated_length": 433.25,
      "epoch": 0.13666666666666666,
      "grad_norm": 0.3983977437019348,
      "kl": 0.33984375,
      "learning_rate": 1.9919681979184452e-05,
      "loss": 0.005,
      "num_tokens": 20609735.0,
      "reward": 0.5492187440395355,
      "reward_std": 0.0031250000465661287,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.4921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 410
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 942.25,
      "completions/max_terminated_length": 937.75,
      "completions/mean_length": 682.890625,
      "completions/mean_terminated_length": 673.7812652587891,
      "completions/min_length": 421.25,
      "completions/min_terminated_length": 421.25,
      "epoch": 0.137,
      "grad_norm": 0.4985494613647461,
      "kl": 0.3798828125,
      "learning_rate": 1.9918203515412616e-05,
      "loss": 0.0338,
      "num_tokens": 20663232.0,
      "reward": 0.630859375,
      "reward_std": 0.17888134391978383,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.08203125,
      "rewards/penalized_accuracy_reward/std": 0.17636188864707947,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.48828125,
      "rewards/tag_count_reward/std": 0.025194555521011353,
      "step": 411
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 980.75,
      "completions/mean_length": 808.875,
      "completions/mean_terminated_length": 773.337158203125,
      "completions/min_length": 520.5,
      "completions/min_terminated_length": 520.5,
      "epoch": 0.13733333333333334,
      "grad_norm": 0.6049057245254517,
      "kl": 0.3701171875,
      "learning_rate": 1.9916711623830904e-05,
      "loss": 0.0982,
      "num_tokens": 20726072.0,
      "reward": 0.5429687649011612,
      "reward_std": 0.014349436154589057,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.4296875,
      "rewards/tag_count_reward/std": 0.1434942465275526,
      "step": 412
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 1021.75,
      "completions/max_terminated_length": 988.75,
      "completions/mean_length": 793.25,
      "completions/mean_terminated_length": 764.8887176513672,
      "completions/min_length": 489.75,
      "completions/min_terminated_length": 489.75,
      "epoch": 0.13766666666666666,
      "grad_norm": 0.5656830072402954,
      "kl": 0.4033203125,
      "learning_rate": 1.9915206306459117e-05,
      "loss": 0.0979,
      "num_tokens": 20786552.0,
      "reward": 0.5460937470197678,
      "reward_std": 0.0076294910395517945,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.4609375,
      "rewards/tag_count_reward/std": 0.07629487104713917,
      "step": 413
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 932.5,
      "completions/max_terminated_length": 862.75,
      "completions/mean_length": 656.46875,
      "completions/mean_terminated_length": 645.9791870117188,
      "completions/min_length": 375.5,
      "completions/min_terminated_length": 375.5,
      "epoch": 0.138,
      "grad_norm": 0.542533814907074,
      "kl": 0.3994140625,
      "learning_rate": 1.9913687565335237e-05,
      "loss": 0.0684,
      "num_tokens": 20836710.0,
      "reward": 0.6936656385660172,
      "reward_std": 0.2626067877281457,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.14483754336833954,
      "rewards/penalized_accuracy_reward/std": 0.2592443823814392,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.48828125,
      "rewards/tag_count_reward/std": 0.046875,
      "step": 414
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 867.75,
      "completions/max_terminated_length": 823.25,
      "completions/mean_length": 624.984375,
      "completions/mean_terminated_length": 606.9050598144531,
      "completions/min_length": 408.75,
      "completions/min_terminated_length": 408.75,
      "epoch": 0.13833333333333334,
      "grad_norm": 0.6048027276992798,
      "kl": 0.4267578125,
      "learning_rate": 1.991215540251542e-05,
      "loss": 0.0707,
      "num_tokens": 20887781.0,
      "reward": 0.5753906220197678,
      "reward_std": 0.11373258696403354,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.02734375,
      "rewards/penalized_accuracy_reward/std": 0.1093750074505806,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.48046875,
      "rewards/tag_count_reward/std": 0.04357585124671459,
      "step": 415
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 741.25,
      "completions/max_terminated_length": 741.25,
      "completions/mean_length": 537.265625,
      "completions/mean_terminated_length": 537.265625,
      "completions/min_length": 342.5,
      "completions/min_terminated_length": 342.5,
      "epoch": 0.13866666666666666,
      "grad_norm": 0.7892355918884277,
      "kl": 0.46240234375,
      "learning_rate": 1.9910609820073986e-05,
      "loss": 0.0576,
      "num_tokens": 20931206.0,
      "reward": 0.5753906220197678,
      "reward_std": 0.11475004884414375,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.02734375,
      "rewards/penalized_accuracy_reward/std": 0.1093750074505806,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.48046875,
      "rewards/tag_count_reward/std": 0.06822281517088413,
      "step": 416
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 637.25,
      "completions/max_terminated_length": 637.25,
      "completions/mean_length": 431.984375,
      "completions/mean_terminated_length": 431.984375,
      "completions/min_length": 262.0,
      "completions/min_terminated_length": 262.0,
      "epoch": 0.139,
      "grad_norm": 0.6798685789108276,
      "kl": 0.41162109375,
      "learning_rate": 1.990905082010344e-05,
      "loss": 0.012,
      "num_tokens": 20968213.0,
      "reward": 0.5440104156732559,
      "reward_std": 0.022408843622542918,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.4921875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 417
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 540.75,
      "completions/max_terminated_length": 540.75,
      "completions/mean_length": 373.71875,
      "completions/mean_terminated_length": 373.71875,
      "completions/min_length": 242.25,
      "completions/min_terminated_length": 242.25,
      "epoch": 0.13933333333333334,
      "grad_norm": 0.5801618099212646,
      "kl": 0.49365234375,
      "learning_rate": 1.9907478404714438e-05,
      "loss": 0.0064,
      "num_tokens": 21002035.0,
      "reward": 0.6189446747303009,
      "reward_std": 0.22255618683993816,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.07415299117565155,
      "rewards/penalized_accuracy_reward/std": 0.20262455940246582,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.5,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 418
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 617.0,
      "completions/max_terminated_length": 617.0,
      "completions/mean_length": 421.625,
      "completions/mean_terminated_length": 421.625,
      "completions/min_length": 219.0,
      "completions/min_terminated_length": 219.0,
      "epoch": 0.13966666666666666,
      "grad_norm": 0.6660025119781494,
      "kl": 0.498046875,
      "learning_rate": 1.9905892576035798e-05,
      "loss": -0.0255,
      "num_tokens": 21038299.0,
      "reward": 0.5717447996139526,
      "reward_std": 0.123722143471241,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.02734375,
      "rewards/penalized_accuracy_reward/std": 0.1093750074505806,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.028463751077651978,
      "rewards/tag_count_reward/mean": 0.49609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 419
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 701.5,
      "completions/max_terminated_length": 629.5,
      "completions/mean_length": 463.1875,
      "completions/mean_terminated_length": 455.28334045410156,
      "completions/min_length": 289.5,
      "completions/min_terminated_length": 289.5,
      "epoch": 0.14,
      "grad_norm": 0.7708501815795898,
      "kl": 0.51708984375,
      "learning_rate": 1.9904293336214508e-05,
      "loss": 0.0814,
      "num_tokens": 21078743.0,
      "reward": 0.548828125,
      "reward_std": 0.0036972808884456754,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.48828125,
      "rewards/tag_count_reward/std": 0.03697281517088413,
      "step": 420
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 766.0,
      "completions/max_terminated_length": 710.0,
      "completions/mean_length": 469.25,
      "completions/mean_terminated_length": 462.2708435058594,
      "completions/min_length": 277.75,
      "completions/min_terminated_length": 277.75,
      "epoch": 0.14033333333333334,
      "grad_norm": 0.7420164346694946,
      "kl": 0.5908203125,
      "learning_rate": 1.9902680687415704e-05,
      "loss": 0.0502,
      "num_tokens": 21118359.0,
      "reward": 0.7088874876499176,
      "reward_std": 0.2456548601621762,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.15927812457084656,
      "rewards/penalized_accuracy_reward/std": 0.24409236013889313,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.49609375,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 421
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 708.75,
      "completions/max_terminated_length": 605.25,
      "completions/mean_length": 409.140625,
      "completions/mean_terminated_length": 400.08959197998047,
      "completions/min_length": 226.75,
      "completions/min_terminated_length": 226.75,
      "epoch": 0.14066666666666666,
      "grad_norm": 1.2998566627502441,
      "kl": 0.611328125,
      "learning_rate": 1.990105463182268e-05,
      "loss": 0.0206,
      "num_tokens": 21154512.0,
      "reward": 0.5515625029802322,
      "reward_std": 0.008713944582268596,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.515625,
      "rewards/tag_count_reward/std": 0.08713950775563717,
      "step": 422
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 775.75,
      "completions/max_terminated_length": 679.75,
      "completions/mean_length": 483.9375,
      "completions/mean_terminated_length": 475.6291732788086,
      "completions/min_length": 239.25,
      "completions/min_terminated_length": 239.25,
      "epoch": 0.141,
      "grad_norm": 1.5227426290512085,
      "kl": 0.53173828125,
      "learning_rate": 1.989941517163688e-05,
      "loss": 0.0387,
      "num_tokens": 21195212.0,
      "reward": 0.5609374940395355,
      "reward_std": 0.01383043429814279,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.609375,
      "rewards/tag_count_reward/std": 0.13830446638166904,
      "step": 423
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 869.75,
      "completions/max_terminated_length": 859.0,
      "completions/mean_length": 622.78125,
      "completions/mean_terminated_length": 596.5625,
      "completions/min_length": 367.5,
      "completions/min_terminated_length": 367.5,
      "epoch": 0.14133333333333334,
      "grad_norm": 1.3768945932388306,
      "kl": 0.732421875,
      "learning_rate": 1.989776230907789e-05,
      "loss": 0.0866,
      "num_tokens": 21245006.0,
      "reward": 0.5614583343267441,
      "reward_std": 0.01999457157216966,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.640625,
      "rewards/tag_count_reward/std": 0.13019821606576443,
      "step": 424
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 941.25,
      "completions/max_terminated_length": 793.0,
      "completions/mean_length": 609.265625,
      "completions/mean_terminated_length": 558.4790954589844,
      "completions/min_length": 410.75,
      "completions/min_terminated_length": 410.75,
      "epoch": 0.14166666666666666,
      "grad_norm": 2.4814653396606445,
      "kl": 1.353515625,
      "learning_rate": 1.9896096046383456e-05,
      "loss": 0.1494,
      "num_tokens": 21296671.0,
      "reward": 0.5548176914453506,
      "reward_std": 0.03875686880201101,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.05442607030272484,
      "rewards/tag_count_reward/mean": 0.65234375,
      "rewards/tag_count_reward/std": 0.18830689042806625,
      "step": 425
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.328125,
      "completions/max_length": 825.0,
      "completions/max_terminated_length": 609.0,
      "completions/mean_length": 642.125,
      "completions/mean_terminated_length": 458.1071472167969,
      "completions/min_length": 314.25,
      "completions/min_terminated_length": 314.25,
      "epoch": 0.142,
      "grad_norm": 3.4336211681365967,
      "kl": 4.107421875,
      "learning_rate": 1.9894416385809444e-05,
      "loss": 0.2316,
      "num_tokens": 21347255.0,
      "reward": 0.5065104141831398,
      "reward_std": 0.09406902268528938,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.911458358168602,
      "rewards/reasoning_steps_reward/std": 0.17035314068198204,
      "rewards/tag_count_reward/mean": 0.5078125,
      "rewards/tag_count_reward/std": 0.16901902854442596,
      "step": 426
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.453125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 688.0,
      "completions/mean_length": 735.015625,
      "completions/mean_terminated_length": 505.79168701171875,
      "completions/min_length": 331.75,
      "completions/min_terminated_length": 331.75,
      "epoch": 0.14233333333333334,
      "grad_norm": 10.202553749084473,
      "kl": 8.59375,
      "learning_rate": 1.9892723329629885e-05,
      "loss": 0.3914,
      "num_tokens": 21404472.0,
      "reward": 0.5317086279392242,
      "reward_std": 0.22374617960304022,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0341825895011425,
      "rewards/penalized_accuracy_reward/std": 0.13673035800457,
      "rewards/reasoning_steps_reward/mean": 0.9114583283662796,
      "rewards/reasoning_steps_reward/std": 0.22356067970395088,
      "rewards/tag_count_reward/mean": 0.41796875,
      "rewards/tag_count_reward/std": 0.2967885471880436,
      "step": 427
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 923.75,
      "completions/max_terminated_length": 847.25,
      "completions/mean_length": 603.96875,
      "completions/mean_terminated_length": 542.2116241455078,
      "completions/min_length": 326.5,
      "completions/min_terminated_length": 326.5,
      "epoch": 0.14266666666666666,
      "grad_norm": 6.315878868103027,
      "kl": 5.078125,
      "learning_rate": 1.9891016880136923e-05,
      "loss": 0.2559,
      "num_tokens": 21452550.0,
      "reward": 0.5242187529802322,
      "reward_std": 0.08881460968405008,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9218750149011612,
      "rewards/reasoning_steps_reward/std": 0.16595671698451042,
      "rewards/tag_count_reward/mean": 0.6328125,
      "rewards/tag_count_reward/std": 0.21946558356285095,
      "step": 428
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 941.5,
      "completions/max_terminated_length": 845.75,
      "completions/mean_length": 639.0625,
      "completions/mean_terminated_length": 571.6411590576172,
      "completions/min_length": 395.75,
      "completions/min_terminated_length": 395.75,
      "epoch": 0.143,
      "grad_norm": 4.72757625579834,
      "kl": 2.603515625,
      "learning_rate": 1.988929703964084e-05,
      "loss": 0.1405,
      "num_tokens": 21504650.0,
      "reward": 0.4993489533662796,
      "reward_std": 0.24587925523519516,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.02734375,
      "rewards/penalized_accuracy_reward/std": 0.1093750074505806,
      "rewards/reasoning_steps_reward/mean": 0.8229166865348816,
      "rewards/reasoning_steps_reward/std": 0.2813824266195297,
      "rewards/tag_count_reward/mean": 0.60546875,
      "rewards/tag_count_reward/std": 0.215969055891037,
      "step": 429
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 897.75,
      "completions/max_terminated_length": 872.5,
      "completions/mean_length": 695.03125,
      "completions/mean_terminated_length": 678.5364685058594,
      "completions/min_length": 488.0,
      "completions/min_terminated_length": 488.0,
      "epoch": 0.14333333333333334,
      "grad_norm": 1.2827024459838867,
      "kl": 0.5068359375,
      "learning_rate": 1.988756381047006e-05,
      "loss": 0.0198,
      "num_tokens": 21559788.0,
      "reward": 0.47968750447034836,
      "reward_std": 0.14802964963018894,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.8125000298023224,
      "rewards/reasoning_steps_reward/std": 0.2938147969543934,
      "rewards/tag_count_reward/mean": 0.734375,
      "rewards/tag_count_reward/std": 0.05920085124671459,
      "step": 430
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 965.5,
      "completions/max_terminated_length": 959.5,
      "completions/mean_length": 723.984375,
      "completions/mean_terminated_length": 719.9395904541016,
      "completions/min_length": 535.25,
      "completions/min_terminated_length": 535.25,
      "epoch": 0.14366666666666666,
      "grad_norm": 1.3184837102890015,
      "kl": 0.708984375,
      "learning_rate": 1.9885817194971116e-05,
      "loss": 0.0187,
      "num_tokens": 21615755.0,
      "reward": 0.5630208253860474,
      "reward_std": 0.03649181989021599,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.06615880131721497,
      "rewards/tag_count_reward/mean": 0.734375,
      "rewards/tag_count_reward/std": 0.049619100987911224,
      "step": 431
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 962.5,
      "completions/max_terminated_length": 951.25,
      "completions/mean_length": 810.46875,
      "completions/mean_terminated_length": 799.4933166503906,
      "completions/min_length": 590.5,
      "completions/min_terminated_length": 590.5,
      "epoch": 0.144,
      "grad_norm": 5.999351978302002,
      "kl": 1.052734375,
      "learning_rate": 1.9884057195508683e-05,
      "loss": 0.0095,
      "num_tokens": 21677961.0,
      "reward": 0.5640624910593033,
      "reward_std": 0.034836260601878166,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9843750149011612,
      "rewards/reasoning_steps_reward/std": 0.0624999962747097,
      "rewards/tag_count_reward/mean": 0.71875,
      "rewards/tag_count_reward/std": 0.08570349216461182,
      "step": 432
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.546875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1011.0,
      "completions/mean_length": 966.8125,
      "completions/mean_terminated_length": 907.8899078369141,
      "completions/min_length": 792.25,
      "completions/min_terminated_length": 792.25,
      "epoch": 0.14433333333333334,
      "grad_norm": 1.0965672731399536,
      "kl": 0.70361328125,
      "learning_rate": 1.988228381446553e-05,
      "loss": 0.0648,
      "num_tokens": 21754045.0,
      "reward": 0.705226257443428,
      "reward_std": 0.23013893724419177,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.14350752532482147,
      "rewards/penalized_accuracy_reward/std": 0.22008824348449707,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.6171875,
      "rewards/tag_count_reward/std": 0.12455067038536072,
      "step": 433
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.671875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 755.0,
      "completions/mean_length": 981.5625,
      "completions/mean_terminated_length": 658.8361206054688,
      "completions/min_length": 798.25,
      "completions/min_terminated_length": 542.25,
      "epoch": 0.14466666666666667,
      "grad_norm": 1.322916030883789,
      "kl": 1.2490234375,
      "learning_rate": 1.9880497054242566e-05,
      "loss": 0.0701,
      "num_tokens": 21827137.0,
      "reward": 0.5509114414453506,
      "reward_std": 0.023322110762819648,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.53515625,
      "rewards/tag_count_reward/std": 0.1606542058289051,
      "step": 434
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 997.75,
      "completions/mean_length": 957.296875,
      "completions/mean_terminated_length": 908.6064605712891,
      "completions/min_length": 767.0,
      "completions/min_terminated_length": 767.0,
      "epoch": 0.145,
      "grad_norm": 0.9156481623649597,
      "kl": 0.80517578125,
      "learning_rate": 1.987869691725881e-05,
      "loss": 0.0678,
      "num_tokens": 21899508.0,
      "reward": 0.5598958283662796,
      "reward_std": 0.02003988972865045,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.625,
      "rewards/tag_count_reward/std": 0.14209389127790928,
      "step": 435
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.421875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 1000.0,
      "completions/mean_length": 930.546875,
      "completions/mean_terminated_length": 858.4285888671875,
      "completions/min_length": 634.75,
      "completions/min_terminated_length": 634.75,
      "epoch": 0.14533333333333334,
      "grad_norm": 1.8983979225158691,
      "kl": 1.8017578125,
      "learning_rate": 1.9876883405951378e-05,
      "loss": 0.1319,
      "num_tokens": 21967799.0,
      "reward": 0.5892978459596634,
      "reward_std": 0.15209323493763804,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0330478698015213,
      "rewards/penalized_accuracy_reward/std": 0.1321914792060852,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.5625,
      "rewards/tag_count_reward/std": 0.230451051145792,
      "step": 436
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.328125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 989.25,
      "completions/mean_length": 883.671875,
      "completions/mean_terminated_length": 824.1293487548828,
      "completions/min_length": 608.0,
      "completions/min_terminated_length": 608.0,
      "epoch": 0.14566666666666667,
      "grad_norm": 1.28177809715271,
      "kl": 2.025390625,
      "learning_rate": 1.9875056522775506e-05,
      "loss": 0.0907,
      "num_tokens": 22034738.0,
      "reward": 0.6134114563465118,
      "reward_std": 0.1763192261569202,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0546875,
      "rewards/penalized_accuracy_reward/std": 0.14943470060825348,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.61328125,
      "rewards/tag_count_reward/std": 0.20768004097044468,
      "step": 437
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.390625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 976.5,
      "completions/mean_length": 882.546875,
      "completions/mean_terminated_length": 830.4378967285156,
      "completions/min_length": 678.0,
      "completions/min_terminated_length": 678.0,
      "epoch": 0.146,
      "grad_norm": 2.7501630783081055,
      "kl": 2.984375,
      "learning_rate": 1.987321627020453e-05,
      "loss": 0.1544,
      "num_tokens": 22103237.0,
      "reward": 0.6039062291383743,
      "reward_std": 0.190100381616503,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0546875,
      "rewards/penalized_accuracy_reward/std": 0.14943470060825348,
      "rewards/reasoning_steps_reward/mean": 0.984375,
      "rewards/reasoning_steps_reward/std": 0.0625,
      "rewards/tag_count_reward/mean": 0.5703125,
      "rewards/tag_count_reward/std": 0.21030498296022415,
      "step": 438
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.375,
      "completions/max_length": 1020.75,
      "completions/max_terminated_length": 994.0,
      "completions/mean_length": 893.90625,
      "completions/mean_terminated_length": 835.4974517822266,
      "completions/min_length": 621.5,
      "completions/min_terminated_length": 621.5,
      "epoch": 0.14633333333333334,
      "grad_norm": 2.1809208393096924,
      "kl": 2.060546875,
      "learning_rate": 1.987136265072988e-05,
      "loss": 0.1069,
      "num_tokens": 22170399.0,
      "reward": 0.5574218779802322,
      "reward_std": 0.023608210729435086,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.57421875,
      "rewards/tag_count_reward/std": 0.2360821943730116,
      "step": 439
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.46875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 966.75,
      "completions/mean_length": 886.671875,
      "completions/mean_terminated_length": 785.5016021728516,
      "completions/min_length": 618.5,
      "completions/min_terminated_length": 618.5,
      "epoch": 0.14666666666666667,
      "grad_norm": 2.85353946685791,
      "kl": 1.9375,
      "learning_rate": 1.9869495666861094e-05,
      "loss": 0.1311,
      "num_tokens": 22239306.0,
      "reward": 0.5869790464639664,
      "reward_std": 0.15728294849395752,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.03385403752326965,
      "rewards/penalized_accuracy_reward/std": 0.1354161649942398,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.53125,
      "rewards/tag_count_reward/std": 0.28653404489159584,
      "step": 440
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 983.5,
      "completions/mean_length": 866.765625,
      "completions/mean_terminated_length": 788.3967895507812,
      "completions/min_length": 599.0,
      "completions/min_terminated_length": 599.0,
      "epoch": 0.147,
      "grad_norm": 1.6231287717819214,
      "kl": 4.0546875,
      "learning_rate": 1.9867615321125796e-05,
      "loss": 0.2683,
      "num_tokens": 22307451.0,
      "reward": 0.5582031160593033,
      "reward_std": 0.025998273864388466,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.58203125,
      "rewards/tag_count_reward/std": 0.25998280197381973,
      "step": 441
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.34375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 819.90625,
      "completions/mean_terminated_length": 727.7122344970703,
      "completions/min_length": 473.0,
      "completions/min_terminated_length": 473.0,
      "epoch": 0.14733333333333334,
      "grad_norm": 3.4805145263671875,
      "kl": 5.953125,
      "learning_rate": 1.9865721616069695e-05,
      "loss": 0.3245,
      "num_tokens": 22370229.0,
      "reward": 0.580208346247673,
      "reward_std": 0.13673926563933492,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.02734375,
      "rewards/penalized_accuracy_reward/std": 0.1093750074505806,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.5546875,
      "rewards/tag_count_reward/std": 0.3218095973134041,
      "step": 442
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 959.25,
      "completions/max_terminated_length": 943.75,
      "completions/mean_length": 802.453125,
      "completions/mean_terminated_length": 749.4040222167969,
      "completions/min_length": 568.5,
      "completions/min_terminated_length": 568.5,
      "epoch": 0.14766666666666667,
      "grad_norm": 1.4201202392578125,
      "kl": 3.9248046875,
      "learning_rate": 1.98638145542566e-05,
      "loss": 0.1667,
      "num_tokens": 22431634.0,
      "reward": 0.5447916388511658,
      "reward_std": 0.0626852991990745,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.0727677047252655,
      "rewards/tag_count_reward/mean": 0.578125,
      "rewards/tag_count_reward/std": 0.2809402644634247,
      "step": 443
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 918.25,
      "completions/mean_length": 737.53125,
      "completions/mean_terminated_length": 705.8508605957031,
      "completions/min_length": 530.0,
      "completions/min_terminated_length": 530.0,
      "epoch": 0.148,
      "grad_norm": 1.6902104616165161,
      "kl": 3.552734375,
      "learning_rate": 1.9861894138268402e-05,
      "loss": 0.2042,
      "num_tokens": 22487748.0,
      "reward": 0.596875011920929,
      "reward_std": 0.12250060332007706,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.02734375,
      "rewards/penalized_accuracy_reward/std": 0.1093750074505806,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.6953125,
      "rewards/tag_count_reward/std": 0.1913830190896988,
      "step": 444
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.203125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 962.75,
      "completions/mean_length": 830.390625,
      "completions/mean_terminated_length": 790.3717346191406,
      "completions/min_length": 603.25,
      "completions/min_terminated_length": 603.25,
      "epoch": 0.14833333333333334,
      "grad_norm": 1.7514704465866089,
      "kl": 4.2451171875,
      "learning_rate": 1.985996037070505e-05,
      "loss": 0.1826,
      "num_tokens": 22551117.0,
      "reward": 0.5587239488959312,
      "reward_std": 0.20655486825853586,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.02734375,
      "rewards/penalized_accuracy_reward/std": 0.1093750074505806,
      "rewards/reasoning_steps_reward/mean": 0.9322916716337204,
      "rewards/reasoning_steps_reward/std": 0.169231366366148,
      "rewards/tag_count_reward/mean": 0.65234375,
      "rewards/tag_count_reward/std": 0.1948315743356943,
      "step": 445
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 958.0,
      "completions/max_terminated_length": 888.5,
      "completions/mean_length": 723.765625,
      "completions/mean_terminated_length": 691.06640625,
      "completions/min_length": 500.0,
      "completions/min_terminated_length": 500.0,
      "epoch": 0.14866666666666667,
      "grad_norm": 0.5837820172309875,
      "kl": 0.314453125,
      "learning_rate": 1.9858013254184597e-05,
      "loss": 0.0079,
      "num_tokens": 22606382.0,
      "reward": 0.5648437291383743,
      "reward_std": 0.01902891811914742,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.6484375,
      "rewards/tag_count_reward/std": 0.19028928130865097,
      "step": 446
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 942.5,
      "completions/mean_length": 836.140625,
      "completions/mean_terminated_length": 793.2091217041016,
      "completions/min_length": 600.75,
      "completions/min_terminated_length": 600.75,
      "epoch": 0.149,
      "grad_norm": 2.6721808910369873,
      "kl": 0.716796875,
      "learning_rate": 1.9856052791343153e-05,
      "loss": 0.0579,
      "num_tokens": 22670183.0,
      "reward": 0.5559895634651184,
      "reward_std": 0.054777587531134486,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.0833333320915699,
      "rewards/tag_count_reward/mean": 0.6640625,
      "rewards/tag_count_reward/std": 0.15826414339244366,
      "step": 447
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 965.5,
      "completions/max_terminated_length": 957.5,
      "completions/mean_length": 766.3125,
      "completions/mean_terminated_length": 736.1430358886719,
      "completions/min_length": 514.25,
      "completions/min_terminated_length": 514.25,
      "epoch": 0.14933333333333335,
      "grad_norm": 0.5376331210136414,
      "kl": 0.31591796875,
      "learning_rate": 1.9854078984834904e-05,
      "loss": -0.0108,
      "num_tokens": 22729643.0,
      "reward": 0.6521260887384415,
      "reward_std": 0.28587909252382815,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.08532924577593803,
      "rewards/penalized_accuracy_reward/std": 0.27568309009075165,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.66796875,
      "rewards/tag_count_reward/std": 0.19684339314699173,
      "step": 448
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 982.0,
      "completions/mean_length": 785.25,
      "completions/mean_terminated_length": 747.2240753173828,
      "completions/min_length": 515.75,
      "completions/min_terminated_length": 515.75,
      "epoch": 0.14966666666666667,
      "grad_norm": 1.1380079984664917,
      "kl": 0.7568359375,
      "learning_rate": 1.985209183733209e-05,
      "loss": 0.0213,
      "num_tokens": 22789771.0,
      "reward": 0.6175781190395355,
      "reward_std": 0.16514140227809548,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0546875,
      "rewards/penalized_accuracy_reward/std": 0.14943470060825348,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.62890625,
      "rewards/tag_count_reward/std": 0.25201747938990593,
      "step": 449
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.15625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 928.0,
      "completions/mean_length": 796.375,
      "completions/mean_terminated_length": 755.7070770263672,
      "completions/min_length": 554.75,
      "completions/min_terminated_length": 554.75,
      "epoch": 0.15,
      "grad_norm": 2.242668867111206,
      "kl": 0.46240234375,
      "learning_rate": 1.985009135152503e-05,
      "loss": 0.0553,
      "num_tokens": 22853443.0,
      "reward": 0.5953125059604645,
      "reward_std": 0.12160531315021217,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.02734375,
      "rewards/penalized_accuracy_reward/std": 0.1093750074505806,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.6796875,
      "rewards/tag_count_reward/std": 0.1715698577463627,
      "step": 450
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 1019.75,
      "completions/max_terminated_length": 991.75,
      "completions/mean_length": 812.578125,
      "completions/mean_terminated_length": 785.7073669433594,
      "completions/min_length": 603.75,
      "completions/min_terminated_length": 603.75,
      "epoch": 0.15033333333333335,
      "grad_norm": 3.406395435333252,
      "kl": 2.2548828125,
      "learning_rate": 1.9848077530122083e-05,
      "loss": 0.155,
      "num_tokens": 22917064.0,
      "reward": 0.5687500089406967,
      "reward_std": 0.015971079003065825,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.6875,
      "rewards/tag_count_reward/std": 0.1597108170390129,
      "step": 451
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.296875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 796.75,
      "completions/mean_length": 716.71875,
      "completions/mean_terminated_length": 598.7527770996094,
      "completions/min_length": 365.25,
      "completions/min_terminated_length": 365.25,
      "epoch": 0.15066666666666667,
      "grad_norm": 15.443132400512695,
      "kl": 14.4765625,
      "learning_rate": 1.9846050375849674e-05,
      "loss": 0.7602,
      "num_tokens": 22972918.0,
      "reward": 0.531901054084301,
      "reward_std": 0.07599062426015735,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9583333283662796,
      "rewards/reasoning_steps_reward/std": 0.10116107389330864,
      "rewards/tag_count_reward/mean": 0.52734375,
      "rewards/tag_count_reward/std": 0.3163977265357971,
      "step": 452
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.71875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 709.5,
      "completions/mean_length": 904.53125,
      "completions/mean_terminated_length": 589.0020980834961,
      "completions/min_length": 495.0,
      "completions/min_terminated_length": 495.0,
      "epoch": 0.151,
      "grad_norm": 85.37786865234375,
      "kl": 39.9375,
      "learning_rate": 1.984400989145228e-05,
      "loss": 1.6947,
      "num_tokens": 23040920.0,
      "reward": 0.4795573055744171,
      "reward_std": 0.13686126098036766,
      "rewards/format_reward/mean": 0.015625,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.895833358168602,
      "rewards/reasoning_steps_reward/std": 0.22046180069446564,
      "rewards/tag_count_reward/mean": 0.25390625,
      "rewards/tag_count_reward/std": 0.3303435668349266,
      "step": 453
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.65625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 767.5,
      "completions/mean_length": 869.421875,
      "completions/mean_terminated_length": 611.6889953613281,
      "completions/min_length": 506.25,
      "completions/min_terminated_length": 506.25,
      "epoch": 0.15133333333333332,
      "grad_norm": 55.974674224853516,
      "kl": 36.25,
      "learning_rate": 1.984195607969242e-05,
      "loss": 1.5795,
      "num_tokens": 23105427.0,
      "reward": 0.4514322876930237,
      "reward_std": 0.16104818508028984,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.8489583730697632,
      "rewards/reasoning_steps_reward/std": 0.2876831628382206,
      "rewards/tag_count_reward/mean": 0.26953125,
      "rewards/tag_count_reward/std": 0.33850353956222534,
      "step": 454
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.53125,
      "completions/max_length": 970.5,
      "completions/max_terminated_length": 839.25,
      "completions/mean_length": 828.875,
      "completions/mean_terminated_length": 645.25,
      "completions/min_length": 445.75,
      "completions/min_terminated_length": 445.75,
      "epoch": 0.15166666666666667,
      "grad_norm": 21.950531005859375,
      "kl": 15.4345703125,
      "learning_rate": 1.9839888943350656e-05,
      "loss": 0.7035,
      "num_tokens": 23169291.0,
      "reward": 0.5143229141831398,
      "reward_std": 0.11808320507407188,
      "rewards/format_reward/mean": 0.015625,
      "rewards/format_reward/std": 0.0625,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9427083432674408,
      "rewards/reasoning_steps_reward/std": 0.16406626999378204,
      "rewards/tag_count_reward/mean": 0.3671875,
      "rewards/tag_count_reward/std": 0.26722504384815693,
      "step": 455
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.453125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 626.75,
      "completions/mean_length": 802.34375,
      "completions/mean_terminated_length": 463.2026672363281,
      "completions/min_length": 539.5,
      "completions/min_terminated_length": 283.5,
      "epoch": 0.152,
      "grad_norm": 4.777839660644531,
      "kl": 4.240234375,
      "learning_rate": 1.983780848522559e-05,
      "loss": 0.2407,
      "num_tokens": 23232369.0,
      "reward": 0.5622395873069763,
      "reward_std": 0.1430051177740097,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.02734375,
      "rewards/penalized_accuracy_reward/std": 0.1093750074505806,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.03726779669523239,
      "rewards/tag_count_reward/mean": 0.453125,
      "rewards/tag_count_reward/std": 0.24603138118982315,
      "step": 456
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.28125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 967.5,
      "completions/mean_length": 747.3125,
      "completions/mean_terminated_length": 653.7940368652344,
      "completions/min_length": 431.0,
      "completions/min_terminated_length": 431.0,
      "epoch": 0.15233333333333332,
      "grad_norm": 4.815750598907471,
      "kl": 0.734375,
      "learning_rate": 1.983571470813386e-05,
      "loss": 0.1456,
      "num_tokens": 23289381.0,
      "reward": 0.6244791597127914,
      "reward_std": 0.16721098124980927,
      "rewards/format_reward/mean": 0.15625,
      "rewards/format_reward/std": 0.3529609143733978,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.0416666641831398,
      "rewards/tag_count_reward/mean": 0.671875,
      "rewards/tag_count_reward/std": 0.3039328083395958,
      "step": 457
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.5,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 776.75,
      "completions/mean_length": 774.171875,
      "completions/mean_terminated_length": 562.4405136108398,
      "completions/min_length": 318.75,
      "completions/min_terminated_length": 318.75,
      "epoch": 0.15266666666666667,
      "grad_norm": 6.34462308883667,
      "kl": 1.1328125,
      "learning_rate": 1.983360761491014e-05,
      "loss": 0.1067,
      "num_tokens": 23354496.0,
      "reward": 0.6123213768005371,
      "reward_std": 0.32740413025021553,
      "rewards/format_reward/mean": 0.15625,
      "rewards/format_reward/std": 0.3604728877544403,
      "rewards/penalized_accuracy_reward/mean": 0.05125368386507034,
      "rewards/penalized_accuracy_reward/std": 0.14041048288345337,
      "rewards/reasoning_steps_reward/mean": 0.9010416865348816,
      "rewards/reasoning_steps_reward/std": 0.2615289017558098,
      "rewards/tag_count_reward/mean": 0.48046875,
      "rewards/tag_count_reward/std": 0.36171500384807587,
      "step": 458
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.1875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 917.25,
      "completions/mean_length": 672.09375,
      "completions/mean_terminated_length": 595.3750228881836,
      "completions/min_length": 229.0,
      "completions/min_terminated_length": 229.0,
      "epoch": 0.153,
      "grad_norm": 4.3616862297058105,
      "kl": 0.9384765625,
      "learning_rate": 1.9831487208407126e-05,
      "loss": 0.1203,
      "num_tokens": 23405526.0,
      "reward": 0.6326136887073517,
      "reward_std": 0.2799901254475117,
      "rewards/format_reward/mean": 0.125,
      "rewards/format_reward/std": 0.28694770485162735,
      "rewards/penalized_accuracy_reward/mean": 0.03743140399456024,
      "rewards/penalized_accuracy_reward/std": 0.14972561597824097,
      "rewards/reasoning_steps_reward/mean": 0.9583333432674408,
      "rewards/reasoning_steps_reward/std": 0.13070852309465408,
      "rewards/tag_count_reward/mean": 0.66015625,
      "rewards/tag_count_reward/std": 0.31298641115427017,
      "step": 459
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 992.0,
      "completions/max_terminated_length": 894.0,
      "completions/mean_length": 654.8125,
      "completions/mean_terminated_length": 599.49853515625,
      "completions/min_length": 309.25,
      "completions/min_terminated_length": 309.25,
      "epoch": 0.15333333333333332,
      "grad_norm": 4.061674118041992,
      "kl": 1.5810546875,
      "learning_rate": 1.9829353491495545e-05,
      "loss": 0.1745,
      "num_tokens": 23459066.0,
      "reward": 0.7437500059604645,
      "reward_std": 0.255161315202713,
      "rewards/format_reward/mean": 0.453125,
      "rewards/format_reward/std": 0.5069767236709595,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.96875,
      "rewards/reasoning_steps_reward/std": 0.125,
      "rewards/tag_count_reward/mean": 0.78125,
      "rewards/tag_count_reward/std": 0.28599051013588905,
      "step": 460
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 977.25,
      "completions/max_terminated_length": 964.25,
      "completions/mean_length": 608.140625,
      "completions/mean_terminated_length": 590.8534393310547,
      "completions/min_length": 363.5,
      "completions/min_terminated_length": 363.5,
      "epoch": 0.15366666666666667,
      "grad_norm": 3.283140182495117,
      "kl": 1.380859375,
      "learning_rate": 1.9827206467064133e-05,
      "loss": 0.1406,
      "num_tokens": 23507267.0,
      "reward": 0.930468738079071,
      "reward_std": 0.15995598956942558,
      "rewards/format_reward/mean": 0.84375,
      "rewards/format_reward/std": 0.3723389655351639,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.9296875,
      "rewards/tag_count_reward/std": 0.1550494320690632,
      "step": 461
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1009.25,
      "completions/max_terminated_length": 921.0,
      "completions/mean_length": 664.59375,
      "completions/mean_terminated_length": 563.5781402587891,
      "completions/min_length": 139.75,
      "completions/min_terminated_length": 139.75,
      "epoch": 0.154,
      "grad_norm": 5.861054420471191,
      "kl": 7.4375,
      "learning_rate": 1.9825046138019658e-05,
      "loss": 0.3875,
      "num_tokens": 23560393.0,
      "reward": 0.7765624970197678,
      "reward_std": 0.28909046202898026,
      "rewards/format_reward/mean": 0.625,
      "rewards/format_reward/std": 0.46039126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.90625,
      "rewards/reasoning_steps_reward/std": 0.28414636105298996,
      "rewards/tag_count_reward/mean": 0.734375,
      "rewards/tag_count_reward/std": 0.29997236654162407,
      "step": 462
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 970.25,
      "completions/max_terminated_length": 860.25,
      "completions/mean_length": 560.21875,
      "completions/mean_terminated_length": 537.9114761352539,
      "completions/min_length": 206.5,
      "completions/min_terminated_length": 206.5,
      "epoch": 0.15433333333333332,
      "grad_norm": 2.371166467666626,
      "kl": 2.4482421875,
      "learning_rate": 1.982287250728689e-05,
      "loss": 0.0957,
      "num_tokens": 23607447.0,
      "reward": 0.8838541507720947,
      "reward_std": 0.19957701489329338,
      "rewards/format_reward/mean": 0.78125,
      "rewards/format_reward/std": 0.4000816270709038,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.1041666641831398,
      "rewards/tag_count_reward/mean": 0.84375,
      "rewards/tag_count_reward/std": 0.26904767379164696,
      "step": 463
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 893.5,
      "completions/max_terminated_length": 893.5,
      "completions/mean_length": 615.984375,
      "completions/mean_terminated_length": 615.984375,
      "completions/min_length": 364.0,
      "completions/min_terminated_length": 364.0,
      "epoch": 0.15466666666666667,
      "grad_norm": 1.8780430555343628,
      "kl": 1.3984375,
      "learning_rate": 1.9820685577808604e-05,
      "loss": 0.1123,
      "num_tokens": 23656662.0,
      "reward": 0.9496093541383743,
      "reward_std": 0.12988915853202343,
      "rewards/format_reward/mean": 0.890625,
      "rewards/format_reward/std": 0.31116948276758194,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.93359375,
      "rewards/tag_count_reward/std": 0.1318533569574356,
      "step": 464
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.046875,
      "completions/max_length": 976.75,
      "completions/max_terminated_length": 786.5,
      "completions/mean_length": 611.21875,
      "completions/mean_terminated_length": 590.4094009399414,
      "completions/min_length": 366.0,
      "completions/min_terminated_length": 366.0,
      "epoch": 0.155,
      "grad_norm": 1.3947129249572754,
      "kl": 3.5634765625,
      "learning_rate": 1.9818485352545595e-05,
      "loss": 0.2216,
      "num_tokens": 23706036.0,
      "reward": 0.8876301944255829,
      "reward_std": 0.18621249124407768,
      "rewards/format_reward/mean": 0.75,
      "rewards/format_reward/std": 0.41898179799318314,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9947916716337204,
      "rewards/reasoning_steps_reward/std": 0.0208333320915699,
      "rewards/tag_count_reward/mean": 0.90234375,
      "rewards/tag_count_reward/std": 0.17755008302628994,
      "step": 465
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 928.25,
      "completions/max_terminated_length": 807.25,
      "completions/mean_length": 504.75,
      "completions/mean_terminated_length": 485.90313720703125,
      "completions/min_length": 205.0,
      "completions/min_terminated_length": 205.0,
      "epoch": 0.15533333333333332,
      "grad_norm": 2.718474864959717,
      "kl": 3.38525390625,
      "learning_rate": 1.9816271834476642e-05,
      "loss": 0.1876,
      "num_tokens": 23751172.0,
      "reward": 0.8970052301883698,
      "reward_std": 0.2223619632422924,
      "rewards/format_reward/mean": 0.828125,
      "rewards/format_reward/std": 0.38336414843797684,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9479166716337204,
      "rewards/reasoning_steps_reward/std": 0.14994098246097565,
      "rewards/tag_count_reward/mean": 0.91796875,
      "rewards/tag_count_reward/std": 0.19615886360406876,
      "step": 466
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.234375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 838.0,
      "completions/mean_length": 629.109375,
      "completions/mean_terminated_length": 511.8250274658203,
      "completions/min_length": 223.0,
      "completions/min_terminated_length": 223.0,
      "epoch": 0.15566666666666668,
      "grad_norm": 14.512189865112305,
      "kl": 15.4140625,
      "learning_rate": 1.981404502659853e-05,
      "loss": 0.7657,
      "num_tokens": 23803083.0,
      "reward": 0.7504923716187477,
      "reward_std": 0.5557838007807732,
      "rewards/format_reward/mean": 0.5,
      "rewards/format_reward/std": 0.46566852182149887,
      "rewards/penalized_accuracy_reward/mean": 0.0932006947696209,
      "rewards/penalized_accuracy_reward/std": 0.3028942197561264,
      "rewards/reasoning_steps_reward/mean": 0.7708333432674408,
      "rewards/reasoning_steps_reward/std": 0.3734225407242775,
      "rewards/tag_count_reward/mean": 0.71875,
      "rewards/tag_count_reward/std": 0.3007545731961727,
      "step": 467
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.21875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 592.25,
      "completions/mean_length": 527.34375,
      "completions/mean_terminated_length": 389.0742645263672,
      "completions/min_length": 180.0,
      "completions/min_terminated_length": 180.0,
      "epoch": 0.156,
      "grad_norm": 12.772680282592773,
      "kl": 17.3203125,
      "learning_rate": 1.981180493192603e-05,
      "loss": 0.9476,
      "num_tokens": 23849377.0,
      "reward": 0.6187500059604645,
      "reward_std": 0.3620525300502777,
      "rewards/format_reward/mean": 0.421875,
      "rewards/format_reward/std": 0.500852182507515,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.765625,
      "rewards/reasoning_steps_reward/std": 0.37763215601444244,
      "rewards/tag_count_reward/mean": 0.671875,
      "rewards/tag_count_reward/std": 0.33192089945077896,
      "step": 468
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 896.25,
      "completions/max_terminated_length": 853.25,
      "completions/mean_length": 513.234375,
      "completions/mean_terminated_length": 487.4890365600586,
      "completions/min_length": 267.5,
      "completions/min_terminated_length": 267.5,
      "epoch": 0.15633333333333332,
      "grad_norm": 2.41903018951416,
      "kl": 1.72265625,
      "learning_rate": 1.9809551553491918e-05,
      "loss": 0.103,
      "num_tokens": 23893920.0,
      "reward": 0.8069010525941849,
      "reward_std": 0.2257729135453701,
      "rewards/format_reward/mean": 0.625,
      "rewards/format_reward/std": 0.47360680997371674,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.08538305386900902,
      "rewards/tag_count_reward/mean": 0.69921875,
      "rewards/tag_count_reward/std": 0.27607953548431396,
      "step": 469
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 673.5,
      "completions/max_terminated_length": 673.5,
      "completions/mean_length": 471.96875,
      "completions/mean_terminated_length": 471.96875,
      "completions/min_length": 249.5,
      "completions/min_terminated_length": 249.5,
      "epoch": 0.15666666666666668,
      "grad_norm": 1.0782326459884644,
      "kl": 0.48193359375,
      "learning_rate": 1.980728489434693e-05,
      "loss": 0.0287,
      "num_tokens": 23933182.0,
      "reward": 0.9272077232599258,
      "reward_std": 0.27451132610440254,
      "rewards/format_reward/mean": 0.765625,
      "rewards/format_reward/std": 0.4079566150903702,
      "rewards/penalized_accuracy_reward/mean": 0.030723346397280693,
      "rewards/penalized_accuracy_reward/std": 0.12289339303970337,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.90234375,
      "rewards/tag_count_reward/std": 0.16636842116713524,
      "step": 470
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 990.25,
      "completions/max_terminated_length": 668.25,
      "completions/mean_length": 426.734375,
      "completions/mean_terminated_length": 386.1576042175293,
      "completions/min_length": 210.0,
      "completions/min_terminated_length": 210.0,
      "epoch": 0.157,
      "grad_norm": 2.5335872173309326,
      "kl": 3.2294921875,
      "learning_rate": 1.9805004957559795e-05,
      "loss": 0.1511,
      "num_tokens": 23971037.0,
      "reward": 0.8740885406732559,
      "reward_std": 0.20123326405882835,
      "rewards/format_reward/mean": 0.75,
      "rewards/format_reward/std": 0.4440634250640869,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9739583432674408,
      "rewards/reasoning_steps_reward/std": 0.08538305386900902,
      "rewards/tag_count_reward/mean": 0.87109375,
      "rewards/tag_count_reward/std": 0.22462255880236626,
      "step": 471
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 839.0,
      "completions/max_terminated_length": 788.5,
      "completions/mean_length": 539.578125,
      "completions/mean_terminated_length": 532.3395843505859,
      "completions/min_length": 322.0,
      "completions/min_terminated_length": 322.0,
      "epoch": 0.15733333333333333,
      "grad_norm": 4.432992935180664,
      "kl": 1.138671875,
      "learning_rate": 1.9802711746217222e-05,
      "loss": 0.032,
      "num_tokens": 24015810.0,
      "reward": 1.0317133069038391,
      "reward_std": 0.2996555743739009,
      "rewards/format_reward/mean": 0.921875,
      "rewards/format_reward/std": 0.23328252136707306,
      "rewards/penalized_accuracy_reward/mean": 0.07611434161663055,
      "rewards/penalized_accuracy_reward/std": 0.20798414945602417,
      "rewards/reasoning_steps_reward/mean": 0.9791666716337204,
      "rewards/reasoning_steps_reward/std": 0.0833333320915699,
      "rewards/tag_count_reward/mean": 0.97265625,
      "rewards/tag_count_reward/std": 0.07779237069189548,
      "step": 472
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.890625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 177.25,
      "completions/mean_length": 931.671875,
      "completions/mean_terminated_length": 107.75,
      "completions/min_length": 313.0,
      "completions/min_terminated_length": 57.0,
      "epoch": 0.15766666666666668,
      "grad_norm": 55.799678802490234,
      "kl": 27.28125,
      "learning_rate": 1.980040526342388e-05,
      "loss": 1.1767,
      "num_tokens": 24085693.0,
      "reward": 0.2610677070915699,
      "reward_std": 0.2550016790628433,
      "rewards/format_reward/mean": 0.046875,
      "rewards/format_reward/std": 0.14789126068353653,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.416666679084301,
      "rewards/reasoning_steps_reward/std": 0.4453532323241234,
      "rewards/tag_count_reward/mean": 0.33984375,
      "rewards/tag_count_reward/std": 0.24816539511084557,
      "step": 473
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.984375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.25,
      "completions/mean_length": 1008.015625,
      "completions/mean_terminated_length": 0.25,
      "completions/min_length": 768.25,
      "completions/min_terminated_length": 0.25,
      "epoch": 0.158,
      "grad_norm": 48.84685516357422,
      "kl": 19.625,
      "learning_rate": 1.9798085512302418e-05,
      "loss": 0.7711,
      "num_tokens": 24160318.0,
      "reward": 0.12356770969927311,
      "reward_std": 0.14149870537221432,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.2135416716337204,
      "rewards/reasoning_steps_reward/std": 0.27610698342323303,
      "rewards/tag_count_reward/mean": 0.16796875,
      "rewards/tag_count_reward/std": 0.12979382276535034,
      "step": 474
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.984375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 230.5,
      "completions/mean_length": 1022.40625,
      "completions/mean_terminated_length": 230.5,
      "completions/min_length": 998.5,
      "completions/min_terminated_length": 230.5,
      "epoch": 0.15833333333333333,
      "grad_norm": 25.87973976135254,
      "kl": 9.75,
      "learning_rate": 1.979575249599344e-05,
      "loss": 0.3893,
      "num_tokens": 24236520.0,
      "reward": 0.10117187723517418,
      "reward_std": 0.12787336483597755,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.1718750037252903,
      "rewards/reasoning_steps_reward/std": 0.24886878952383995,
      "rewards/tag_count_reward/mean": 0.15234375,
      "rewards/tag_count_reward/std": 0.1389313079416752,
      "step": 475
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.15866666666666668,
      "grad_norm": 9.469910621643066,
      "kl": 3.26953125,
      "learning_rate": 1.9793406217655516e-05,
      "loss": 0.1307,
      "num_tokens": 24314728.0,
      "reward": 0.08541666809469461,
      "reward_std": 0.1373102180659771,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.1458333320915699,
      "rewards/reasoning_steps_reward/std": 0.2653312236070633,
      "rewards/tag_count_reward/mean": 0.125,
      "rewards/tag_count_reward/std": 0.12704972177743912,
      "step": 476
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.159,
      "grad_norm": 1.5597959756851196,
      "kl": 0.5771484375,
      "learning_rate": 1.979104668046516e-05,
      "loss": 0.0231,
      "num_tokens": 24389032.0,
      "reward": 0.08945313096046448,
      "reward_std": 0.12571851909160614,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.15625000558793545,
      "rewards/reasoning_steps_reward/std": 0.2400699369609356,
      "rewards/tag_count_reward/mean": 0.11328125,
      "rewards/tag_count_reward/std": 0.12099427729845047,
      "step": 477
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.15933333333333333,
      "grad_norm": 0.8009725213050842,
      "kl": 0.168701171875,
      "learning_rate": 1.9788673887616852e-05,
      "loss": 0.0067,
      "num_tokens": 24464520.0,
      "reward": 0.13242188096046448,
      "reward_std": 0.12582270056009293,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.2343750186264515,
      "rewards/reasoning_steps_reward/std": 0.23368741944432259,
      "rewards/tag_count_reward/mean": 0.15234375,
      "rewards/tag_count_reward/std": 0.11941073834896088,
      "step": 478
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.15966666666666668,
      "grad_norm": 0.558620035648346,
      "kl": 0.153076171875,
      "learning_rate": 1.9786287842323002e-05,
      "loss": 0.0061,
      "num_tokens": 24541576.0,
      "reward": 0.21380207687616348,
      "reward_std": 0.12126775458455086,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.385416679084301,
      "rewards/reasoning_steps_reward/std": 0.23433792963624,
      "rewards/tag_count_reward/mean": 0.2109375,
      "rewards/tag_count_reward/std": 0.07206955552101135,
      "step": 479
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16,
      "grad_norm": 0.27308619022369385,
      "kl": 0.1376953125,
      "learning_rate": 1.978388854781397e-05,
      "loss": 0.0055,
      "num_tokens": 24616664.0,
      "reward": 0.3531249910593033,
      "reward_std": 0.1377511229366064,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.6562500149011612,
      "rewards/reasoning_steps_reward/std": 0.2755022719502449,
      "rewards/tag_count_reward/mean": 0.25,
      "rewards/tag_count_reward/std": 0.0,
      "step": 480
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16033333333333333,
      "grad_norm": 0.35946178436279297,
      "kl": 0.1312255859375,
      "learning_rate": 1.9781476007338058e-05,
      "loss": 0.0053,
      "num_tokens": 24691752.0,
      "reward": 0.4755208194255829,
      "reward_std": 0.09098472259938717,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9010416865348816,
      "rewards/reasoning_steps_reward/std": 0.18196947127580643,
      "rewards/tag_count_reward/mean": 0.25,
      "rewards/tag_count_reward/std": 0.0,
      "step": 481
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16066666666666668,
      "grad_norm": 0.09131309390068054,
      "kl": 0.1270751953125,
      "learning_rate": 1.9779050224161494e-05,
      "loss": 0.0051,
      "num_tokens": 24767704.0,
      "reward": 0.5249999761581421,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.25,
      "rewards/tag_count_reward/std": 0.0,
      "step": 482
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.161,
      "grad_norm": 0.08760453760623932,
      "kl": 0.1317138671875,
      "learning_rate": 1.9776611201568434e-05,
      "loss": 0.0053,
      "num_tokens": 24847144.0,
      "reward": 0.5249999761581421,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.25,
      "rewards/tag_count_reward/std": 0.0,
      "step": 483
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16133333333333333,
      "grad_norm": 0.23719899356365204,
      "kl": 0.150146484375,
      "learning_rate": 1.9774158942860962e-05,
      "loss": 0.006,
      "num_tokens": 24921944.0,
      "reward": 0.5164062231779099,
      "reward_std": 0.031114285811781883,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.984375,
      "rewards/reasoning_steps_reward/std": 0.0625,
      "rewards/tag_count_reward/mean": 0.2421875,
      "rewards/tag_count_reward/std": 0.021347815170884132,
      "step": 484
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16166666666666665,
      "grad_norm": 0.13751773536205292,
      "kl": 0.21533203125,
      "learning_rate": 1.977169345135908e-05,
      "loss": 0.0086,
      "num_tokens": 24995352.0,
      "reward": 0.5249999761581421,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.25,
      "rewards/tag_count_reward/std": 0.0,
      "step": 485
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.162,
      "grad_norm": 0.14502756297588348,
      "kl": 0.246337890625,
      "learning_rate": 1.976921473040071e-05,
      "loss": 0.0099,
      "num_tokens": 25070408.0,
      "reward": 0.5249999761581421,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.25,
      "rewards/tag_count_reward/std": 0.0,
      "step": 486
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16233333333333333,
      "grad_norm": 0.2363901138305664,
      "kl": 0.29052734375,
      "learning_rate": 1.9766722783341682e-05,
      "loss": 0.0116,
      "num_tokens": 25143960.0,
      "reward": 0.5249999761581421,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.25,
      "rewards/tag_count_reward/std": 0.0,
      "step": 487
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16266666666666665,
      "grad_norm": 1.5923470258712769,
      "kl": 0.35986328125,
      "learning_rate": 1.976421761355572e-05,
      "loss": 0.0144,
      "num_tokens": 25218552.0,
      "reward": 0.5253905951976776,
      "reward_std": 0.0015625039814040065,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.25390625,
      "rewards/tag_count_reward/std": 0.015625,
      "step": 488
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.163,
      "grad_norm": 0.37030932307243347,
      "kl": 0.3203125,
      "learning_rate": 1.9761699224434476e-05,
      "loss": 0.0128,
      "num_tokens": 25295160.0,
      "reward": 0.524218738079071,
      "reward_std": 0.0031250000465661287,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.2421875,
      "rewards/tag_count_reward/std": 0.03125,
      "step": 489
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.640625,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 730.0,
      "completions/mean_length": 827.359375,
      "completions/mean_terminated_length": 470.5062484741211,
      "completions/min_length": 293.0,
      "completions/min_terminated_length": 293.0,
      "epoch": 0.16333333333333333,
      "grad_norm": 0.975154459476471,
      "kl": 0.38525390625,
      "learning_rate": 1.9759167619387474e-05,
      "loss": 0.2072,
      "num_tokens": 25360095.0,
      "reward": 0.6637881994247437,
      "reward_std": 0.2572151683270931,
      "rewards/format_reward/mean": 0.25,
      "rewards/format_reward/std": 0.43655145168304443,
      "rewards/penalized_accuracy_reward/mean": 0.024725718423724174,
      "rewards/penalized_accuracy_reward/std": 0.09890288859605789,
      "rewards/reasoning_steps_reward/mean": 1.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.390625,
      "rewards/tag_count_reward/std": 0.23340947180986404,
      "step": 490
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 414.5,
      "completions/mean_length": 327.515625,
      "completions/mean_terminated_length": 255.17144012451172,
      "completions/min_length": 142.5,
      "completions/min_terminated_length": 142.5,
      "epoch": 0.16366666666666665,
      "grad_norm": 2.5863473415374756,
      "kl": 0.46240234375,
      "learning_rate": 1.9756622801842144e-05,
      "loss": 0.6562,
      "num_tokens": 25391056.0,
      "reward": 0.9365885406732559,
      "reward_std": 0.14823532104492188,
      "rewards/format_reward/mean": 0.90625,
      "rewards/format_reward/std": 0.29578252136707306,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9895833432674408,
      "rewards/reasoning_steps_reward/std": 0.028463751077651978,
      "rewards/tag_count_reward/mean": 0.79296875,
      "rewards/tag_count_reward/std": 0.37085768580436707,
      "step": 491
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.328125,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 427.5,
      "completions/mean_length": 460.296875,
      "completions/mean_terminated_length": 182.46656799316406,
      "completions/min_length": 59.25,
      "completions/min_terminated_length": 59.25,
      "epoch": 0.164,
      "grad_norm": 2.3596835136413574,
      "kl": 0.416015625,
      "learning_rate": 1.9754064775243797e-05,
      "loss": 0.6765,
      "num_tokens": 25431619.0,
      "reward": 0.7888020873069763,
      "reward_std": 0.22947583347558975,
      "rewards/format_reward/mean": 0.609375,
      "rewards/format_reward/std": 0.4819520115852356,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.9635416865348816,
      "rewards/reasoning_steps_reward/std": 0.11545588448643684,
      "rewards/tag_count_reward/mean": 0.6328125,
      "rewards/tag_count_reward/std": 0.3889954835176468,
      "step": 492
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 0.296875,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 177.0,
      "completions/mean_length": 354.59375,
      "completions/mean_terminated_length": 70.25404167175293,
      "completions/min_length": 22.25,
      "completions/min_terminated_length": 22.25,
      "epoch": 0.16433333333333333,
      "grad_norm": 6.96124267578125,
      "kl": 0.9287109375,
      "learning_rate": 1.9751493543055634e-05,
      "loss": 0.326,
      "num_tokens": 25464729.0,
      "reward": 0.5072916820645332,
      "reward_std": 0.27873288094997406,
      "rewards/format_reward/mean": 0.609375,
      "rewards/format_reward/std": 0.48605145514011383,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.3802083432674408,
      "rewards/reasoning_steps_reward/std": 0.4222742021083832,
      "rewards/tag_count_reward/mean": 0.734375,
      "rewards/tag_count_reward/std": 0.30691851302981377,
      "step": 493
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16466666666666666,
      "grad_norm": 6.878725051879883,
      "kl": 4.8125,
      "learning_rate": 1.9748909108758727e-05,
      "loss": 0.1925,
      "num_tokens": 25541753.0,
      "reward": 0.17213542014360428,
      "reward_std": 0.18678832054138184,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.3177083507180214,
      "rewards/reasoning_steps_reward/std": 0.3730955421924591,
      "rewards/tag_count_reward/mean": 0.1328125,
      "rewards/tag_count_reward/std": 0.11971627920866013,
      "step": 494
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.165,
      "grad_norm": 2.3972485065460205,
      "kl": 1.1015625,
      "learning_rate": 1.9746311475852028e-05,
      "loss": 0.0441,
      "num_tokens": 25618601.0,
      "reward": 0.18919270858168602,
      "reward_std": 0.2180721014738083,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.3697916865348816,
      "rewards/reasoning_steps_reward/std": 0.4412023276090622,
      "rewards/tag_count_reward/mean": 0.04296875,
      "rewards/tag_count_reward/std": 0.08957063034176826,
      "step": 495
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16533333333333333,
      "grad_norm": 0.005385370459407568,
      "kl": 0.049560546875,
      "learning_rate": 1.9743700647852356e-05,
      "loss": 0.002,
      "num_tokens": 25693913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 496
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16566666666666666,
      "grad_norm": 0.0007258623954840004,
      "kl": 0.04669189453125,
      "learning_rate": 1.9741076628294387e-05,
      "loss": 0.0019,
      "num_tokens": 25769449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 497
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.166,
      "grad_norm": 4.1728949327080045e-06,
      "kl": 0.044189453125,
      "learning_rate": 1.9738439420730674e-05,
      "loss": 0.0018,
      "num_tokens": 25847465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 498
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16633333333333333,
      "grad_norm": 3.0347960091603454e-06,
      "kl": 0.04339599609375,
      "learning_rate": 1.9735789028731603e-05,
      "loss": 0.0017,
      "num_tokens": 25925305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 499
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16666666666666666,
      "grad_norm": 2.655534899531631e-06,
      "kl": 0.04791259765625,
      "learning_rate": 1.973312545588543e-05,
      "loss": 0.0019,
      "num_tokens": 25999129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 500
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.167,
      "grad_norm": 3.536061512932065e-06,
      "kl": 0.046630859375,
      "learning_rate": 1.973044870579824e-05,
      "loss": 0.0019,
      "num_tokens": 26074825.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 501
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16733333333333333,
      "grad_norm": 2.505825477783219e-06,
      "kl": 0.0421142578125,
      "learning_rate": 1.972775878209397e-05,
      "loss": 0.0017,
      "num_tokens": 26151913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 502
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16766666666666666,
      "grad_norm": 6.029501037119189e-06,
      "kl": 0.0399169921875,
      "learning_rate": 1.9725055688414378e-05,
      "loss": 0.0016,
      "num_tokens": 26234457.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 503
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.168,
      "grad_norm": 4.605010872182902e-06,
      "kl": 0.04815673828125,
      "learning_rate": 1.972233942841907e-05,
      "loss": 0.0019,
      "num_tokens": 26310425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 504
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16833333333333333,
      "grad_norm": 9.006074833450839e-06,
      "kl": 0.0472412109375,
      "learning_rate": 1.9719610005785466e-05,
      "loss": 0.0019,
      "num_tokens": 26388953.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 505
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16866666666666666,
      "grad_norm": 0.0003149510594084859,
      "kl": 0.04443359375,
      "learning_rate": 1.9716867424208805e-05,
      "loss": 0.0018,
      "num_tokens": 26470969.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 506
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.169,
      "grad_norm": 5.774768396804575e-06,
      "kl": 0.04522705078125,
      "learning_rate": 1.9714111687402146e-05,
      "loss": 0.0018,
      "num_tokens": 26547817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 507
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16933333333333334,
      "grad_norm": 5.529911049961811e-06,
      "kl": 0.04559326171875,
      "learning_rate": 1.971134279909636e-05,
      "loss": 0.0018,
      "num_tokens": 26623753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 508
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.16966666666666666,
      "grad_norm": 4.581507710099686e-06,
      "kl": 0.04547119140625,
      "learning_rate": 1.970856076304012e-05,
      "loss": 0.0018,
      "num_tokens": 26700681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 509
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17,
      "grad_norm": 5.098050223750761e-06,
      "kl": 0.04620361328125,
      "learning_rate": 1.97057655829999e-05,
      "loss": 0.0018,
      "num_tokens": 26775929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 510
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17033333333333334,
      "grad_norm": 5.974345185677521e-06,
      "kl": 0.04779052734375,
      "learning_rate": 1.9702957262759964e-05,
      "loss": 0.0019,
      "num_tokens": 26850905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 511
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17066666666666666,
      "grad_norm": 0.0003325820725876838,
      "kl": 0.04296875,
      "learning_rate": 1.9700135806122378e-05,
      "loss": 0.0017,
      "num_tokens": 26930553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 512
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.171,
      "grad_norm": 0.00033304333919659257,
      "kl": 0.04541015625,
      "learning_rate": 1.969730121690698e-05,
      "loss": 0.0018,
      "num_tokens": 27005097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 513
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17133333333333334,
      "grad_norm": 2.6074931156472303e-06,
      "kl": 0.04351806640625,
      "learning_rate": 1.9694453498951392e-05,
      "loss": 0.0017,
      "num_tokens": 27078441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 514
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17166666666666666,
      "grad_norm": 0.00036497588735073805,
      "kl": 0.04791259765625,
      "learning_rate": 1.969159265611101e-05,
      "loss": 0.0019,
      "num_tokens": 27155305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 515
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.172,
      "grad_norm": 4.051561063533882e-06,
      "kl": 0.04730224609375,
      "learning_rate": 1.9688718692259007e-05,
      "loss": 0.0019,
      "num_tokens": 27231033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 516
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17233333333333334,
      "grad_norm": 7.149666998884641e-06,
      "kl": 0.0462646484375,
      "learning_rate": 1.9685831611286312e-05,
      "loss": 0.0019,
      "num_tokens": 27308425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 517
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17266666666666666,
      "grad_norm": 2.3875866190792294e-06,
      "kl": 0.0418701171875,
      "learning_rate": 1.968293141710161e-05,
      "loss": 0.0017,
      "num_tokens": 27382729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 518
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.173,
      "grad_norm": 4.664221705752425e-06,
      "kl": 0.04730224609375,
      "learning_rate": 1.9680018113631347e-05,
      "loss": 0.0019,
      "num_tokens": 27457977.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 519
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17333333333333334,
      "grad_norm": 2.3671229882893385e-06,
      "kl": 0.04437255859375,
      "learning_rate": 1.9677091704819714e-05,
      "loss": 0.0018,
      "num_tokens": 27532809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 520
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17366666666666666,
      "grad_norm": 3.5438849863567157e-06,
      "kl": 0.04730224609375,
      "learning_rate": 1.967415219462864e-05,
      "loss": 0.0019,
      "num_tokens": 27608233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 521
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.174,
      "grad_norm": 0.00036429663305170834,
      "kl": 0.04412841796875,
      "learning_rate": 1.96711995870378e-05,
      "loss": 0.0018,
      "num_tokens": 27685001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 522
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17433333333333334,
      "grad_norm": 3.541435717124841e-06,
      "kl": 0.04229736328125,
      "learning_rate": 1.9668233886044597e-05,
      "loss": 0.0017,
      "num_tokens": 27760857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 523
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17466666666666666,
      "grad_norm": 3.7687332223867998e-06,
      "kl": 0.04852294921875,
      "learning_rate": 1.9665255095664155e-05,
      "loss": 0.0019,
      "num_tokens": 27839465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 524
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.175,
      "grad_norm": 3.8078387660789303e-06,
      "kl": 0.044921875,
      "learning_rate": 1.966226321992933e-05,
      "loss": 0.0018,
      "num_tokens": 27914441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 525
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17533333333333334,
      "grad_norm": 4.219351922074566e-06,
      "kl": 0.05010986328125,
      "learning_rate": 1.9659258262890683e-05,
      "loss": 0.002,
      "num_tokens": 27989225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 526
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17566666666666667,
      "grad_norm": 4.180098585493397e-06,
      "kl": 0.04632568359375,
      "learning_rate": 1.9656240228616496e-05,
      "loss": 0.0019,
      "num_tokens": 28066457.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 527
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.176,
      "grad_norm": 3.0759954370296327e-06,
      "kl": 0.04669189453125,
      "learning_rate": 1.9653209121192747e-05,
      "loss": 0.0019,
      "num_tokens": 28142601.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 528
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17633333333333334,
      "grad_norm": 1.986911911444622e-06,
      "kl": 0.04364013671875,
      "learning_rate": 1.9650164944723116e-05,
      "loss": 0.0017,
      "num_tokens": 28218105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 529
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17666666666666667,
      "grad_norm": 4.626205736713018e-06,
      "kl": 0.0460205078125,
      "learning_rate": 1.964710770332898e-05,
      "loss": 0.0018,
      "num_tokens": 28294137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 530
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.177,
      "grad_norm": 2.776182554953266e-06,
      "kl": 0.0447998046875,
      "learning_rate": 1.964403740114939e-05,
      "loss": 0.0018,
      "num_tokens": 28369001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 531
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17733333333333334,
      "grad_norm": 2.6137449822272174e-06,
      "kl": 0.0411376953125,
      "learning_rate": 1.96409540423411e-05,
      "loss": 0.0016,
      "num_tokens": 28448153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 532
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17766666666666667,
      "grad_norm": 2.546193627495086e-06,
      "kl": 0.04632568359375,
      "learning_rate": 1.9637857631078532e-05,
      "loss": 0.0019,
      "num_tokens": 28525161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 533
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.178,
      "grad_norm": 2.155697984562721e-06,
      "kl": 0.04559326171875,
      "learning_rate": 1.9634748171553775e-05,
      "loss": 0.0018,
      "num_tokens": 28607017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 534
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17833333333333334,
      "grad_norm": 0.00035390304401516914,
      "kl": 0.0439453125,
      "learning_rate": 1.9631625667976584e-05,
      "loss": 0.0018,
      "num_tokens": 28682473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 535
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17866666666666667,
      "grad_norm": 2.281694378325483e-06,
      "kl": 0.04302978515625,
      "learning_rate": 1.962849012457438e-05,
      "loss": 0.0017,
      "num_tokens": 28758553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 536
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.179,
      "grad_norm": 3.3165251807076856e-06,
      "kl": 0.04718017578125,
      "learning_rate": 1.9625341545592226e-05,
      "loss": 0.0019,
      "num_tokens": 28835049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 537
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17933333333333334,
      "grad_norm": 3.200831315552932e-06,
      "kl": 0.04412841796875,
      "learning_rate": 1.9622179935292855e-05,
      "loss": 0.0018,
      "num_tokens": 28910745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 538
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.17966666666666667,
      "grad_norm": 3.6182398162054596e-06,
      "kl": 0.04736328125,
      "learning_rate": 1.9619005297956623e-05,
      "loss": 0.0019,
      "num_tokens": 28988441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 539
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18,
      "grad_norm": 2.3984318886505207e-06,
      "kl": 0.045654296875,
      "learning_rate": 1.961581763788152e-05,
      "loss": 0.0018,
      "num_tokens": 29064473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 540
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18033333333333335,
      "grad_norm": 2.6171785521000857e-06,
      "kl": 0.04656982421875,
      "learning_rate": 1.961261695938319e-05,
      "loss": 0.0019,
      "num_tokens": 29139913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 541
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18066666666666667,
      "grad_norm": 1.799889219000761e-06,
      "kl": 0.04571533203125,
      "learning_rate": 1.960940326679488e-05,
      "loss": 0.0018,
      "num_tokens": 29220009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 542
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.181,
      "grad_norm": 2.294459136464866e-06,
      "kl": 0.04736328125,
      "learning_rate": 1.9606176564467465e-05,
      "loss": 0.0019,
      "num_tokens": 29294457.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 543
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18133333333333335,
      "grad_norm": 0.00036517292028293014,
      "kl": 0.04388427734375,
      "learning_rate": 1.9602936856769432e-05,
      "loss": 0.0018,
      "num_tokens": 29369849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 544
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18166666666666667,
      "grad_norm": 1.4415670648304513e-06,
      "kl": 0.048095703125,
      "learning_rate": 1.9599684148086876e-05,
      "loss": 0.0019,
      "num_tokens": 29444297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 545
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.182,
      "grad_norm": 1.974558017536765e-06,
      "kl": 0.047607421875,
      "learning_rate": 1.9596418442823495e-05,
      "loss": 0.0019,
      "num_tokens": 29519065.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 546
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18233333333333332,
      "grad_norm": 2.0785569176950958e-06,
      "kl": 0.04736328125,
      "learning_rate": 1.9593139745400575e-05,
      "loss": 0.0019,
      "num_tokens": 29602025.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 547
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18266666666666667,
      "grad_norm": 2.3286127088795183e-06,
      "kl": 0.04534912109375,
      "learning_rate": 1.9589848060257007e-05,
      "loss": 0.0018,
      "num_tokens": 29679209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 548
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.183,
      "grad_norm": 2.620727173052728e-06,
      "kl": 0.04559326171875,
      "learning_rate": 1.9586543391849243e-05,
      "loss": 0.0018,
      "num_tokens": 29753833.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 549
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18333333333333332,
      "grad_norm": 2.065638000203762e-06,
      "kl": 0.0462646484375,
      "learning_rate": 1.9583225744651334e-05,
      "loss": 0.0018,
      "num_tokens": 29830553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 550
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18366666666666667,
      "grad_norm": 1.8364229390499531e-06,
      "kl": 0.04766845703125,
      "learning_rate": 1.957989512315489e-05,
      "loss": 0.0019,
      "num_tokens": 29907289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 551
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.184,
      "grad_norm": 1.488981069996953e-06,
      "kl": 0.044677734375,
      "learning_rate": 1.9576551531869092e-05,
      "loss": 0.0018,
      "num_tokens": 29982521.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 552
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18433333333333332,
      "grad_norm": 2.238492925243918e-06,
      "kl": 0.04364013671875,
      "learning_rate": 1.9573194975320672e-05,
      "loss": 0.0017,
      "num_tokens": 30058473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 553
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18466666666666667,
      "grad_norm": 0.0006856236141175032,
      "kl": 0.0452880859375,
      "learning_rate": 1.956982545805393e-05,
      "loss": 0.0018,
      "num_tokens": 30136265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 554
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.185,
      "grad_norm": 2.4181592834793264e-06,
      "kl": 0.04620361328125,
      "learning_rate": 1.95664429846307e-05,
      "loss": 0.0018,
      "num_tokens": 30211449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 555
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18533333333333332,
      "grad_norm": 0.00036669173277914524,
      "kl": 0.0467529296875,
      "learning_rate": 1.9563047559630356e-05,
      "loss": 0.0019,
      "num_tokens": 30287193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 556
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18566666666666667,
      "grad_norm": 1.4481156540568918e-06,
      "kl": 0.04388427734375,
      "learning_rate": 1.9559639187649817e-05,
      "loss": 0.0018,
      "num_tokens": 30361401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 557
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.186,
      "grad_norm": 1.5893500631136703e-06,
      "kl": 0.0452880859375,
      "learning_rate": 1.9556217873303526e-05,
      "loss": 0.0018,
      "num_tokens": 30435385.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 558
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18633333333333332,
      "grad_norm": 1.0814345614562626e-06,
      "kl": 0.04583740234375,
      "learning_rate": 1.9552783621223437e-05,
      "loss": 0.0018,
      "num_tokens": 30508377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 559
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18666666666666668,
      "grad_norm": 1.5342546930696699e-06,
      "kl": 0.04217529296875,
      "learning_rate": 1.954933643605904e-05,
      "loss": 0.0017,
      "num_tokens": 30585049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 560
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.187,
      "grad_norm": 1.0698181540647056e-06,
      "kl": 0.0458984375,
      "learning_rate": 1.954587632247732e-05,
      "loss": 0.0018,
      "num_tokens": 30659465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 561
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18733333333333332,
      "grad_norm": 1.5908269688225118e-06,
      "kl": 0.04595947265625,
      "learning_rate": 1.954240328516277e-05,
      "loss": 0.0018,
      "num_tokens": 30736169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 562
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18766666666666668,
      "grad_norm": 1.179083255919977e-06,
      "kl": 0.04815673828125,
      "learning_rate": 1.9538917328817377e-05,
      "loss": 0.0019,
      "num_tokens": 30810521.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 563
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.188,
      "grad_norm": 1.1328260143272928e-06,
      "kl": 0.04742431640625,
      "learning_rate": 1.9535418458160625e-05,
      "loss": 0.0019,
      "num_tokens": 30884361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 564
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18833333333333332,
      "grad_norm": 1.593016236256517e-06,
      "kl": 0.04571533203125,
      "learning_rate": 1.9531906677929472e-05,
      "loss": 0.0018,
      "num_tokens": 30958601.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 565
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18866666666666668,
      "grad_norm": 0.00036747107515111566,
      "kl": 0.04644775390625,
      "learning_rate": 1.9528381992878362e-05,
      "loss": 0.0019,
      "num_tokens": 31033145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 566
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.189,
      "grad_norm": 1.2656346370931715e-06,
      "kl": 0.04498291015625,
      "learning_rate": 1.9524844407779208e-05,
      "loss": 0.0018,
      "num_tokens": 31109641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 567
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18933333333333333,
      "grad_norm": 0.00032583263237029314,
      "kl": 0.04669189453125,
      "learning_rate": 1.9521293927421388e-05,
      "loss": 0.0019,
      "num_tokens": 31186537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 568
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.18966666666666668,
      "grad_norm": 1.0520099067434785e-06,
      "kl": 0.04327392578125,
      "learning_rate": 1.951773055661174e-05,
      "loss": 0.0017,
      "num_tokens": 31262057.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 569
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19,
      "grad_norm": 8.720533628547855e-07,
      "kl": 0.04473876953125,
      "learning_rate": 1.9514154300174542e-05,
      "loss": 0.0018,
      "num_tokens": 31338409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 570
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19033333333333333,
      "grad_norm": 1.0684104836400365e-06,
      "kl": 0.042236328125,
      "learning_rate": 1.9510565162951538e-05,
      "loss": 0.0017,
      "num_tokens": 31412489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 571
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19066666666666668,
      "grad_norm": 1.5664135162296589e-06,
      "kl": 0.045166015625,
      "learning_rate": 1.9506963149801894e-05,
      "loss": 0.0018,
      "num_tokens": 31491145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 572
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.191,
      "grad_norm": 1.0929404652415542e-06,
      "kl": 0.04669189453125,
      "learning_rate": 1.9503348265602212e-05,
      "loss": 0.0019,
      "num_tokens": 31567849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 573
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19133333333333333,
      "grad_norm": 9.160022500509513e-07,
      "kl": 0.04510498046875,
      "learning_rate": 1.9499720515246524e-05,
      "loss": 0.0018,
      "num_tokens": 31642729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 574
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19166666666666668,
      "grad_norm": 7.101962751221436e-07,
      "kl": 0.0440673828125,
      "learning_rate": 1.9496079903646282e-05,
      "loss": 0.0018,
      "num_tokens": 31715689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 575
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.192,
      "grad_norm": 1.3223746009316528e-06,
      "kl": 0.04315185546875,
      "learning_rate": 1.949242643573034e-05,
      "loss": 0.0017,
      "num_tokens": 31790825.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 576
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19233333333333333,
      "grad_norm": 1.117964075092459e-06,
      "kl": 0.05084228515625,
      "learning_rate": 1.9488760116444966e-05,
      "loss": 0.002,
      "num_tokens": 31868441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 577
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19266666666666668,
      "grad_norm": 1.1153841796840425e-06,
      "kl": 0.044189453125,
      "learning_rate": 1.948508095075383e-05,
      "loss": 0.0018,
      "num_tokens": 31943769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 578
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.193,
      "grad_norm": 0.0003569263790268451,
      "kl": 0.04486083984375,
      "learning_rate": 1.9481388943637976e-05,
      "loss": 0.0018,
      "num_tokens": 32018985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 579
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19333333333333333,
      "grad_norm": 7.219896360766143e-07,
      "kl": 0.04736328125,
      "learning_rate": 1.947768410009586e-05,
      "loss": 0.0019,
      "num_tokens": 32093513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 580
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19366666666666665,
      "grad_norm": 1.1462284419394564e-06,
      "kl": 0.04608154296875,
      "learning_rate": 1.9473966425143292e-05,
      "loss": 0.0018,
      "num_tokens": 32168841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 581
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.194,
      "grad_norm": 1.1973814935117844e-06,
      "kl": 0.04541015625,
      "learning_rate": 1.947023592381348e-05,
      "loss": 0.0018,
      "num_tokens": 32245561.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 582
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19433333333333333,
      "grad_norm": 1.0366587730459287e-06,
      "kl": 0.04290771484375,
      "learning_rate": 1.9466492601156964e-05,
      "loss": 0.0017,
      "num_tokens": 32321769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 583
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19466666666666665,
      "grad_norm": 1.0237714604954817e-06,
      "kl": 0.0428466796875,
      "learning_rate": 1.9462736462241672e-05,
      "loss": 0.0017,
      "num_tokens": 32397529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 584
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.195,
      "grad_norm": 9.8329178399581e-07,
      "kl": 0.04962158203125,
      "learning_rate": 1.9458967512152872e-05,
      "loss": 0.002,
      "num_tokens": 32472489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 585
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19533333333333333,
      "grad_norm": 9.641221367928665e-07,
      "kl": 0.04791259765625,
      "learning_rate": 1.945518575599317e-05,
      "loss": 0.0019,
      "num_tokens": 32547929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 586
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19566666666666666,
      "grad_norm": 1.0616884082992328e-06,
      "kl": 0.04296875,
      "learning_rate": 1.945139119888252e-05,
      "loss": 0.0017,
      "num_tokens": 32623689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 587
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.196,
      "grad_norm": 5.584269615610538e-07,
      "kl": 0.04595947265625,
      "learning_rate": 1.9447583845958198e-05,
      "loss": 0.0018,
      "num_tokens": 32697689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 588
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19633333333333333,
      "grad_norm": 0.00040306319715455174,
      "kl": 0.0452880859375,
      "learning_rate": 1.944376370237481e-05,
      "loss": 0.0018,
      "num_tokens": 32774393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 589
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19666666666666666,
      "grad_norm": 8.158833679772215e-07,
      "kl": 0.0484619140625,
      "learning_rate": 1.9439930773304284e-05,
      "loss": 0.0019,
      "num_tokens": 32849049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 590
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.197,
      "grad_norm": 9.90066382655641e-07,
      "kl": 0.04595947265625,
      "learning_rate": 1.9436085063935837e-05,
      "loss": 0.0018,
      "num_tokens": 32926345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 591
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19733333333333333,
      "grad_norm": 5.940618166278e-07,
      "kl": 0.0469970703125,
      "learning_rate": 1.943222657947601e-05,
      "loss": 0.0019,
      "num_tokens": 33001049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 592
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19766666666666666,
      "grad_norm": 1.0115753639183822e-06,
      "kl": 0.0438232421875,
      "learning_rate": 1.9428355325148632e-05,
      "loss": 0.0018,
      "num_tokens": 33076185.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 593
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.198,
      "grad_norm": 1.0544408723944798e-06,
      "kl": 0.046142578125,
      "learning_rate": 1.9424471306194822e-05,
      "loss": 0.0018,
      "num_tokens": 33151273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 594
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19833333333333333,
      "grad_norm": 7.935628332234046e-07,
      "kl": 0.04339599609375,
      "learning_rate": 1.942057452787297e-05,
      "loss": 0.0017,
      "num_tokens": 33224537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 595
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19866666666666666,
      "grad_norm": 5.599874839390395e-07,
      "kl": 0.04681396484375,
      "learning_rate": 1.9416664995458756e-05,
      "loss": 0.0019,
      "num_tokens": 33301929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 596
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.199,
      "grad_norm": 7.87715464412031e-07,
      "kl": 0.0439453125,
      "learning_rate": 1.941274271424512e-05,
      "loss": 0.0018,
      "num_tokens": 33379017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 597
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19933333333333333,
      "grad_norm": 5.585097255789151e-07,
      "kl": 0.03985595703125,
      "learning_rate": 1.9408807689542257e-05,
      "loss": 0.0016,
      "num_tokens": 33458233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 598
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.19966666666666666,
      "grad_norm": 7.517481890317868e-07,
      "kl": 0.04473876953125,
      "learning_rate": 1.9404859926677625e-05,
      "loss": 0.0018,
      "num_tokens": 33538089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 599
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2,
      "grad_norm": 7.894312830103445e-07,
      "kl": 0.0423583984375,
      "learning_rate": 1.9400899430995923e-05,
      "loss": 0.0017,
      "num_tokens": 33613417.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 600
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20033333333333334,
      "grad_norm": 6.630521625083929e-07,
      "kl": 0.04644775390625,
      "learning_rate": 1.9396926207859085e-05,
      "loss": 0.0019,
      "num_tokens": 33688537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 601
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20066666666666666,
      "grad_norm": 9.316294153904892e-07,
      "kl": 0.052001953125,
      "learning_rate": 1.9392940262646284e-05,
      "loss": 0.0021,
      "num_tokens": 33767225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 602
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.201,
      "grad_norm": 6.783401431675884e-07,
      "kl": 0.04730224609375,
      "learning_rate": 1.9388941600753902e-05,
      "loss": 0.0019,
      "num_tokens": 33843817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 603
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20133333333333334,
      "grad_norm": 0.0003504432097543031,
      "kl": 0.047607421875,
      "learning_rate": 1.938493022759556e-05,
      "loss": 0.0019,
      "num_tokens": 33918441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 604
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20166666666666666,
      "grad_norm": 5.354993959372223e-07,
      "kl": 0.0474853515625,
      "learning_rate": 1.938090614860207e-05,
      "loss": 0.0019,
      "num_tokens": 33992201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 605
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.202,
      "grad_norm": 5.560214049182832e-07,
      "kl": 0.0455322265625,
      "learning_rate": 1.937686936922145e-05,
      "loss": 0.0018,
      "num_tokens": 34067289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 606
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20233333333333334,
      "grad_norm": 8.298476359414053e-07,
      "kl": 0.04718017578125,
      "learning_rate": 1.937281989491892e-05,
      "loss": 0.0019,
      "num_tokens": 34143113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 607
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20266666666666666,
      "grad_norm": 6.181532512528065e-07,
      "kl": 0.04736328125,
      "learning_rate": 1.936875773117687e-05,
      "loss": 0.0019,
      "num_tokens": 34220489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 608
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.203,
      "grad_norm": 6.222510364750633e-07,
      "kl": 0.04669189453125,
      "learning_rate": 1.9364682883494892e-05,
      "loss": 0.0019,
      "num_tokens": 34297113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 609
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20333333333333334,
      "grad_norm": 8.284929435831145e-07,
      "kl": 0.04608154296875,
      "learning_rate": 1.9360595357389735e-05,
      "loss": 0.0018,
      "num_tokens": 34375673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 610
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20366666666666666,
      "grad_norm": 7.932925427667215e-07,
      "kl": 0.045654296875,
      "learning_rate": 1.9356495158395317e-05,
      "loss": 0.0018,
      "num_tokens": 34451785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 611
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.204,
      "grad_norm": 7.791092571096669e-07,
      "kl": 0.04620361328125,
      "learning_rate": 1.9352382292062712e-05,
      "loss": 0.0018,
      "num_tokens": 34527769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 612
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20433333333333334,
      "grad_norm": 3.7999194546500803e-07,
      "kl": 0.04302978515625,
      "learning_rate": 1.9348256763960146e-05,
      "loss": 0.0017,
      "num_tokens": 34606473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 613
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20466666666666666,
      "grad_norm": 4.3525051296455786e-07,
      "kl": 0.0443115234375,
      "learning_rate": 1.9344118579672987e-05,
      "loss": 0.0018,
      "num_tokens": 34681321.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 614
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.205,
      "grad_norm": 6.318659302451124e-07,
      "kl": 0.0447998046875,
      "learning_rate": 1.9339967744803735e-05,
      "loss": 0.0018,
      "num_tokens": 34757705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 615
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20533333333333334,
      "grad_norm": 4.347545541349973e-07,
      "kl": 0.04644775390625,
      "learning_rate": 1.9335804264972018e-05,
      "loss": 0.0019,
      "num_tokens": 34831449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 616
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20566666666666666,
      "grad_norm": 4.64629408725159e-07,
      "kl": 0.04742431640625,
      "learning_rate": 1.9331628145814587e-05,
      "loss": 0.0019,
      "num_tokens": 34906185.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 617
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.206,
      "grad_norm": 5.55207577690453e-07,
      "kl": 0.047119140625,
      "learning_rate": 1.93274393929853e-05,
      "loss": 0.0019,
      "num_tokens": 34981369.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 618
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20633333333333334,
      "grad_norm": 6.003366479490069e-07,
      "kl": 0.04705810546875,
      "learning_rate": 1.9323238012155125e-05,
      "loss": 0.0019,
      "num_tokens": 35058137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 619
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20666666666666667,
      "grad_norm": 5.862183343197103e-07,
      "kl": 0.0379638671875,
      "learning_rate": 1.9319024009012114e-05,
      "loss": 0.0015,
      "num_tokens": 35138473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 620
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.207,
      "grad_norm": 6.453685159613087e-07,
      "kl": 0.042236328125,
      "learning_rate": 1.9314797389261426e-05,
      "loss": 0.0017,
      "num_tokens": 35214297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 621
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20733333333333334,
      "grad_norm": 6.374597205649479e-07,
      "kl": 0.04522705078125,
      "learning_rate": 1.9310558158625286e-05,
      "loss": 0.0018,
      "num_tokens": 35292393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 622
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20766666666666667,
      "grad_norm": 8.011625709514192e-07,
      "kl": 0.04840087890625,
      "learning_rate": 1.9306306322842994e-05,
      "loss": 0.0019,
      "num_tokens": 35369033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 623
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.208,
      "grad_norm": 5.33383285983291e-07,
      "kl": 0.04681396484375,
      "learning_rate": 1.930204188767093e-05,
      "loss": 0.0019,
      "num_tokens": 35443625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 624
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20833333333333334,
      "grad_norm": 5.202861075304099e-07,
      "kl": 0.0457763671875,
      "learning_rate": 1.9297764858882516e-05,
      "loss": 0.0018,
      "num_tokens": 35518489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 625
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20866666666666667,
      "grad_norm": 5.681910124621936e-07,
      "kl": 0.04473876953125,
      "learning_rate": 1.9293475242268224e-05,
      "loss": 0.0018,
      "num_tokens": 35592793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 626
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.209,
      "grad_norm": 3.2985295206344745e-07,
      "kl": 0.04791259765625,
      "learning_rate": 1.9289173043635584e-05,
      "loss": 0.0019,
      "num_tokens": 35666377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 627
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20933333333333334,
      "grad_norm": 7.614339097017364e-07,
      "kl": 0.04608154296875,
      "learning_rate": 1.9284858268809135e-05,
      "loss": 0.0018,
      "num_tokens": 35745065.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 628
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.20966666666666667,
      "grad_norm": 7.926134344415914e-07,
      "kl": 0.046630859375,
      "learning_rate": 1.928053092363047e-05,
      "loss": 0.0019,
      "num_tokens": 35822537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 629
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21,
      "grad_norm": 0.000319549348205328,
      "kl": 0.04766845703125,
      "learning_rate": 1.927619101395818e-05,
      "loss": 0.0019,
      "num_tokens": 35904793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 630
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21033333333333334,
      "grad_norm": 7.020657335488067e-07,
      "kl": 0.046142578125,
      "learning_rate": 1.9271838545667876e-05,
      "loss": 0.0018,
      "num_tokens": 35981865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 631
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21066666666666667,
      "grad_norm": 6.815338906562829e-07,
      "kl": 0.0445556640625,
      "learning_rate": 1.9267473524652168e-05,
      "loss": 0.0018,
      "num_tokens": 36060969.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 632
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.211,
      "grad_norm": 3.966312931424909e-07,
      "kl": 0.04913330078125,
      "learning_rate": 1.926309595682066e-05,
      "loss": 0.002,
      "num_tokens": 36134665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 633
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21133333333333335,
      "grad_norm": 6.308768547569343e-07,
      "kl": 0.04498291015625,
      "learning_rate": 1.925870584809995e-05,
      "loss": 0.0018,
      "num_tokens": 36210713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 634
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21166666666666667,
      "grad_norm": 5.821686386298097e-07,
      "kl": 0.0458984375,
      "learning_rate": 1.9254303204433602e-05,
      "loss": 0.0018,
      "num_tokens": 36287177.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 635
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.212,
      "grad_norm": 4.2193923377453757e-07,
      "kl": 0.04876708984375,
      "learning_rate": 1.924988803178216e-05,
      "loss": 0.002,
      "num_tokens": 36361977.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 636
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21233333333333335,
      "grad_norm": 5.425221161203808e-07,
      "kl": 0.0438232421875,
      "learning_rate": 1.9245460336123136e-05,
      "loss": 0.0018,
      "num_tokens": 36440729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 637
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21266666666666667,
      "grad_norm": 4.0420098912363756e-07,
      "kl": 0.04266357421875,
      "learning_rate": 1.9241020123450972e-05,
      "loss": 0.0017,
      "num_tokens": 36516649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 638
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.213,
      "grad_norm": 8.250989367297734e-07,
      "kl": 0.04742431640625,
      "learning_rate": 1.9236567399777086e-05,
      "loss": 0.0019,
      "num_tokens": 36594681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 639
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21333333333333335,
      "grad_norm": 5.732405838898558e-07,
      "kl": 0.04296875,
      "learning_rate": 1.923210217112981e-05,
      "loss": 0.0017,
      "num_tokens": 36673449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 640
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21366666666666667,
      "grad_norm": 5.579394155574846e-07,
      "kl": 0.0426025390625,
      "learning_rate": 1.9227624443554425e-05,
      "loss": 0.0017,
      "num_tokens": 36748649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 641
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.214,
      "grad_norm": 4.645865772090474e-07,
      "kl": 0.04852294921875,
      "learning_rate": 1.9223134223113122e-05,
      "loss": 0.0019,
      "num_tokens": 36823129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 642
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21433333333333332,
      "grad_norm": 3.995492647845822e-07,
      "kl": 0.0423583984375,
      "learning_rate": 1.9218631515885007e-05,
      "loss": 0.0017,
      "num_tokens": 36897753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 643
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21466666666666667,
      "grad_norm": 4.308226095872669e-07,
      "kl": 0.0445556640625,
      "learning_rate": 1.9214116327966095e-05,
      "loss": 0.0018,
      "num_tokens": 36976425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 644
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.215,
      "grad_norm": 2.6320984147787385e-07,
      "kl": 0.04473876953125,
      "learning_rate": 1.9209588665469294e-05,
      "loss": 0.0018,
      "num_tokens": 37050441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 645
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21533333333333332,
      "grad_norm": 4.1645921555755194e-07,
      "kl": 0.04864501953125,
      "learning_rate": 1.9205048534524405e-05,
      "loss": 0.0019,
      "num_tokens": 37126089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 646
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21566666666666667,
      "grad_norm": 3.734244558017963e-07,
      "kl": 0.044189453125,
      "learning_rate": 1.9200495941278105e-05,
      "loss": 0.0018,
      "num_tokens": 37202809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 647
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.216,
      "grad_norm": 3.4602098253344593e-07,
      "kl": 0.04730224609375,
      "learning_rate": 1.9195930891893946e-05,
      "loss": 0.0019,
      "num_tokens": 37276393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 648
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21633333333333332,
      "grad_norm": 3.98334606188655e-07,
      "kl": 0.0435791015625,
      "learning_rate": 1.9191353392552346e-05,
      "loss": 0.0017,
      "num_tokens": 37353353.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 649
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21666666666666667,
      "grad_norm": 3.844149318865675e-07,
      "kl": 0.0479736328125,
      "learning_rate": 1.9186763449450572e-05,
      "loss": 0.0019,
      "num_tokens": 37428585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 650
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.217,
      "grad_norm": 4.270441422704607e-07,
      "kl": 0.04534912109375,
      "learning_rate": 1.9182161068802742e-05,
      "loss": 0.0018,
      "num_tokens": 37506489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 651
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21733333333333332,
      "grad_norm": 6.118355031503597e-07,
      "kl": 0.04656982421875,
      "learning_rate": 1.9177546256839814e-05,
      "loss": 0.0019,
      "num_tokens": 37582201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 652
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21766666666666667,
      "grad_norm": 4.588695503571216e-07,
      "kl": 0.04656982421875,
      "learning_rate": 1.9172919019809572e-05,
      "loss": 0.0019,
      "num_tokens": 37657225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 653
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.218,
      "grad_norm": 3.8411499758694845e-07,
      "kl": 0.04534912109375,
      "learning_rate": 1.9168279363976627e-05,
      "loss": 0.0018,
      "num_tokens": 37731577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 654
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21833333333333332,
      "grad_norm": 4.050346831263596e-07,
      "kl": 0.046875,
      "learning_rate": 1.9163627295622397e-05,
      "loss": 0.0019,
      "num_tokens": 37811289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 655
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21866666666666668,
      "grad_norm": 3.476679069081001e-07,
      "kl": 0.0487060546875,
      "learning_rate": 1.9158962821045113e-05,
      "loss": 0.0019,
      "num_tokens": 37886489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 656
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.219,
      "grad_norm": 3.4760830658342456e-07,
      "kl": 0.04766845703125,
      "learning_rate": 1.9154285946559792e-05,
      "loss": 0.0019,
      "num_tokens": 37960617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 657
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21933333333333332,
      "grad_norm": 3.754953468160238e-07,
      "kl": 0.0458984375,
      "learning_rate": 1.914959667849825e-05,
      "loss": 0.0018,
      "num_tokens": 38036425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 658
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.21966666666666668,
      "grad_norm": 3.688506637899991e-07,
      "kl": 0.04632568359375,
      "learning_rate": 1.9144895023209072e-05,
      "loss": 0.0019,
      "num_tokens": 38110089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 659
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22,
      "grad_norm": 6.699099799334363e-07,
      "kl": 0.045654296875,
      "learning_rate": 1.914018098705762e-05,
      "loss": 0.0018,
      "num_tokens": 38186473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 660
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22033333333333333,
      "grad_norm": 7.001960966590559e-07,
      "kl": 0.0458984375,
      "learning_rate": 1.913545457642601e-05,
      "loss": 0.0018,
      "num_tokens": 38263321.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 661
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22066666666666668,
      "grad_norm": 4.042692864913988e-07,
      "kl": 0.044921875,
      "learning_rate": 1.9130715797713123e-05,
      "loss": 0.0018,
      "num_tokens": 38339241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 662
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.221,
      "grad_norm": 6.046190605957236e-07,
      "kl": 0.05047607421875,
      "learning_rate": 1.912596465733458e-05,
      "loss": 0.002,
      "num_tokens": 38415049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 663
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22133333333333333,
      "grad_norm": 4.457091051790485e-07,
      "kl": 0.043701171875,
      "learning_rate": 1.9121201161722732e-05,
      "loss": 0.0017,
      "num_tokens": 38492601.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 664
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22166666666666668,
      "grad_norm": 6.932128258085868e-07,
      "kl": 0.0479736328125,
      "learning_rate": 1.911642531732666e-05,
      "loss": 0.0019,
      "num_tokens": 38570665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 665
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.222,
      "grad_norm": 4.7143629444690305e-07,
      "kl": 0.048583984375,
      "learning_rate": 1.9111637130612172e-05,
      "loss": 0.0019,
      "num_tokens": 38645113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 666
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22233333333333333,
      "grad_norm": 5.260311013444152e-07,
      "kl": 0.04644775390625,
      "learning_rate": 1.910683660806177e-05,
      "loss": 0.0019,
      "num_tokens": 38720313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 667
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22266666666666668,
      "grad_norm": 4.739439418699476e-07,
      "kl": 0.04656982421875,
      "learning_rate": 1.9102023756174675e-05,
      "loss": 0.0019,
      "num_tokens": 38795657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 668
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.223,
      "grad_norm": 4.6509964590768504e-07,
      "kl": 0.044189453125,
      "learning_rate": 1.9097198581466785e-05,
      "loss": 0.0018,
      "num_tokens": 38871081.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 669
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22333333333333333,
      "grad_norm": 3.4570342677398003e-07,
      "kl": 0.04705810546875,
      "learning_rate": 1.9092361090470688e-05,
      "loss": 0.0019,
      "num_tokens": 38945561.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 670
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22366666666666668,
      "grad_norm": 5.26558835645119e-07,
      "kl": 0.0509033203125,
      "learning_rate": 1.9087511289735646e-05,
      "loss": 0.002,
      "num_tokens": 39022009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 671
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.224,
      "grad_norm": 4.127924171370978e-07,
      "kl": 0.04541015625,
      "learning_rate": 1.9082649185827583e-05,
      "loss": 0.0018,
      "num_tokens": 39097977.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 672
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22433333333333333,
      "grad_norm": 4.78803087844426e-07,
      "kl": 0.0491943359375,
      "learning_rate": 1.907777478532909e-05,
      "loss": 0.002,
      "num_tokens": 39174041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 673
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22466666666666665,
      "grad_norm": 3.9622898384550354e-07,
      "kl": 0.04779052734375,
      "learning_rate": 1.907288809483939e-05,
      "loss": 0.0019,
      "num_tokens": 39247513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 674
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.225,
      "grad_norm": 6.678451427433174e-07,
      "kl": 0.04833984375,
      "learning_rate": 1.906798912097436e-05,
      "loss": 0.0019,
      "num_tokens": 39324569.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 675
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22533333333333333,
      "grad_norm": 6.836395414211438e-07,
      "kl": 0.04388427734375,
      "learning_rate": 1.9063077870366504e-05,
      "loss": 0.0018,
      "num_tokens": 39402121.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 676
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22566666666666665,
      "grad_norm": 5.298132919051568e-07,
      "kl": 0.04498291015625,
      "learning_rate": 1.9058154349664932e-05,
      "loss": 0.0018,
      "num_tokens": 39478441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 677
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.226,
      "grad_norm": 5.294585889714654e-07,
      "kl": 0.05047607421875,
      "learning_rate": 1.9053218565535383e-05,
      "loss": 0.002,
      "num_tokens": 39552841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 678
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22633333333333333,
      "grad_norm": 6.245261943149671e-07,
      "kl": 0.04461669921875,
      "learning_rate": 1.9048270524660197e-05,
      "loss": 0.0018,
      "num_tokens": 39629593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 679
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22666666666666666,
      "grad_norm": 6.321162686617754e-07,
      "kl": 0.0428466796875,
      "learning_rate": 1.90433102337383e-05,
      "loss": 0.0017,
      "num_tokens": 39706569.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 680
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.227,
      "grad_norm": 3.668321539862518e-07,
      "kl": 0.04547119140625,
      "learning_rate": 1.9038337699485207e-05,
      "loss": 0.0018,
      "num_tokens": 39781769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 681
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22733333333333333,
      "grad_norm": 5.203608566262119e-07,
      "kl": 0.04217529296875,
      "learning_rate": 1.903335292863301e-05,
      "loss": 0.0017,
      "num_tokens": 39862761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 682
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22766666666666666,
      "grad_norm": 3.6629140254262893e-07,
      "kl": 0.05181884765625,
      "learning_rate": 1.9028355927930363e-05,
      "loss": 0.0021,
      "num_tokens": 39937433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 683
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.228,
      "grad_norm": 3.0064828138165467e-07,
      "kl": 0.0489501953125,
      "learning_rate": 1.9023346704142488e-05,
      "loss": 0.002,
      "num_tokens": 40012009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 684
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22833333333333333,
      "grad_norm": 7.485186870326288e-07,
      "kl": 0.04705810546875,
      "learning_rate": 1.901832526405114e-05,
      "loss": 0.0019,
      "num_tokens": 40087401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 685
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22866666666666666,
      "grad_norm": 5.358536441235628e-07,
      "kl": 0.0478515625,
      "learning_rate": 1.9013291614454622e-05,
      "loss": 0.0019,
      "num_tokens": 40163225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 686
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.229,
      "grad_norm": 5.794017852167599e-07,
      "kl": 0.04559326171875,
      "learning_rate": 1.9008245762167773e-05,
      "loss": 0.0018,
      "num_tokens": 40240793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 687
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22933333333333333,
      "grad_norm": 3.6473832665251393e-07,
      "kl": 0.044677734375,
      "learning_rate": 1.9003187714021936e-05,
      "loss": 0.0018,
      "num_tokens": 40316537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 688
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.22966666666666666,
      "grad_norm": 7.599930427204526e-07,
      "kl": 0.044677734375,
      "learning_rate": 1.8998117476864984e-05,
      "loss": 0.0018,
      "num_tokens": 40394793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 689
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23,
      "grad_norm": 5.503894158209732e-07,
      "kl": 0.04852294921875,
      "learning_rate": 1.8993035057561274e-05,
      "loss": 0.0019,
      "num_tokens": 40473193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 690
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23033333333333333,
      "grad_norm": 4.909775270789396e-07,
      "kl": 0.04400634765625,
      "learning_rate": 1.8987940462991673e-05,
      "loss": 0.0018,
      "num_tokens": 40546409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 691
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23066666666666666,
      "grad_norm": 5.05828552377352e-07,
      "kl": 0.0472412109375,
      "learning_rate": 1.8982833700053518e-05,
      "loss": 0.0019,
      "num_tokens": 40622073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 692
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.231,
      "grad_norm": 2.8664194928751385e-07,
      "kl": 0.04571533203125,
      "learning_rate": 1.897771477566063e-05,
      "loss": 0.0018,
      "num_tokens": 40696889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 693
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23133333333333334,
      "grad_norm": 4.128796149416303e-07,
      "kl": 0.0419921875,
      "learning_rate": 1.8972583696743284e-05,
      "loss": 0.0017,
      "num_tokens": 40778713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 694
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23166666666666666,
      "grad_norm": 3.268671378009458e-07,
      "kl": 0.04388427734375,
      "learning_rate": 1.8967440470248227e-05,
      "loss": 0.0018,
      "num_tokens": 40853913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 695
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.232,
      "grad_norm": 3.5997581449009886e-07,
      "kl": 0.04669189453125,
      "learning_rate": 1.8962285103138637e-05,
      "loss": 0.0019,
      "num_tokens": 40928441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 696
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23233333333333334,
      "grad_norm": 3.8002178825990995e-07,
      "kl": 0.04620361328125,
      "learning_rate": 1.895711760239413e-05,
      "loss": 0.0019,
      "num_tokens": 41007049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 697
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23266666666666666,
      "grad_norm": 4.90494642235717e-07,
      "kl": 0.04376220703125,
      "learning_rate": 1.895193797501076e-05,
      "loss": 0.0018,
      "num_tokens": 41081209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 698
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.233,
      "grad_norm": 4.93644108701119e-07,
      "kl": 0.046875,
      "learning_rate": 1.8946746228000987e-05,
      "loss": 0.0019,
      "num_tokens": 41158441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 699
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23333333333333334,
      "grad_norm": 4.896035079582362e-07,
      "kl": 0.05010986328125,
      "learning_rate": 1.8941542368393683e-05,
      "loss": 0.002,
      "num_tokens": 41232889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 700
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23366666666666666,
      "grad_norm": 3.914606168109458e-07,
      "kl": 0.0438232421875,
      "learning_rate": 1.8936326403234125e-05,
      "loss": 0.0018,
      "num_tokens": 41305865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 701
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.234,
      "grad_norm": 6.543713766404835e-07,
      "kl": 0.04534912109375,
      "learning_rate": 1.893109833958397e-05,
      "loss": 0.0018,
      "num_tokens": 41381497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 702
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23433333333333334,
      "grad_norm": 3.5247768437329796e-07,
      "kl": 0.0440673828125,
      "learning_rate": 1.892585818452126e-05,
      "loss": 0.0018,
      "num_tokens": 41457833.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 703
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23466666666666666,
      "grad_norm": 5.829222686770663e-07,
      "kl": 0.045654296875,
      "learning_rate": 1.8920605945140396e-05,
      "loss": 0.0018,
      "num_tokens": 41534841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 704
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.235,
      "grad_norm": 5.0437063237041e-07,
      "kl": 0.04449462890625,
      "learning_rate": 1.8915341628552166e-05,
      "loss": 0.0018,
      "num_tokens": 41612153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 705
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23533333333333334,
      "grad_norm": 6.390648081833206e-07,
      "kl": 0.04736328125,
      "learning_rate": 1.891006524188368e-05,
      "loss": 0.0019,
      "num_tokens": 41687449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 706
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23566666666666666,
      "grad_norm": 4.818857064492477e-07,
      "kl": 0.0482177734375,
      "learning_rate": 1.8904776792278403e-05,
      "loss": 0.0019,
      "num_tokens": 41763625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 707
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.236,
      "grad_norm": 4.535727669008338e-07,
      "kl": 0.04840087890625,
      "learning_rate": 1.889947628689613e-05,
      "loss": 0.0019,
      "num_tokens": 41839161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 708
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23633333333333334,
      "grad_norm": 5.368497113522608e-07,
      "kl": 0.0477294921875,
      "learning_rate": 1.889416373291298e-05,
      "loss": 0.0019,
      "num_tokens": 41915817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 709
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23666666666666666,
      "grad_norm": 2.403244536708371e-07,
      "kl": 0.0430908203125,
      "learning_rate": 1.888883913752137e-05,
      "loss": 0.0017,
      "num_tokens": 41989977.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 710
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.237,
      "grad_norm": 4.815542524738703e-07,
      "kl": 0.0452880859375,
      "learning_rate": 1.8883502507930044e-05,
      "loss": 0.0018,
      "num_tokens": 42065769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 711
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23733333333333334,
      "grad_norm": 4.835127924707194e-07,
      "kl": 0.04510498046875,
      "learning_rate": 1.8878153851364013e-05,
      "loss": 0.0018,
      "num_tokens": 42139449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 712
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23766666666666666,
      "grad_norm": 4.935627089253103e-07,
      "kl": 0.04754638671875,
      "learning_rate": 1.8872793175064594e-05,
      "loss": 0.0019,
      "num_tokens": 42215625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 713
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.238,
      "grad_norm": 4.2039519598802144e-07,
      "kl": 0.04962158203125,
      "learning_rate": 1.886742048628936e-05,
      "loss": 0.002,
      "num_tokens": 42291673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 714
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23833333333333334,
      "grad_norm": 5.261549631541129e-07,
      "kl": 0.0433349609375,
      "learning_rate": 1.8862035792312148e-05,
      "loss": 0.0017,
      "num_tokens": 42366505.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 715
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23866666666666667,
      "grad_norm": 5.154719247002504e-07,
      "kl": 0.04608154296875,
      "learning_rate": 1.8856639100423062e-05,
      "loss": 0.0018,
      "num_tokens": 42450521.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 716
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.239,
      "grad_norm": 6.216779979695275e-07,
      "kl": 0.04974365234375,
      "learning_rate": 1.8851230417928433e-05,
      "loss": 0.002,
      "num_tokens": 42526553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 717
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23933333333333334,
      "grad_norm": 4.082999680576904e-07,
      "kl": 0.04339599609375,
      "learning_rate": 1.884580975215084e-05,
      "loss": 0.0017,
      "num_tokens": 42601593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 718
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.23966666666666667,
      "grad_norm": 3.3806034593908407e-07,
      "kl": 0.0469970703125,
      "learning_rate": 1.8840377110429075e-05,
      "loss": 0.0019,
      "num_tokens": 42674649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 719
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24,
      "grad_norm": 1.1858687685162295e-06,
      "kl": 0.046630859375,
      "learning_rate": 1.8834932500118148e-05,
      "loss": 0.0019,
      "num_tokens": 42752361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 720
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24033333333333334,
      "grad_norm": 0.00032892401213757694,
      "kl": 0.0478515625,
      "learning_rate": 1.8829475928589272e-05,
      "loss": 0.0019,
      "num_tokens": 42827353.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 721
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24066666666666667,
      "grad_norm": 2.776667713533243e-07,
      "kl": 0.044921875,
      "learning_rate": 1.8824007403229852e-05,
      "loss": 0.0018,
      "num_tokens": 42899593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 722
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.241,
      "grad_norm": 3.1084536544767616e-07,
      "kl": 0.04315185546875,
      "learning_rate": 1.881852693144348e-05,
      "loss": 0.0017,
      "num_tokens": 42974265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 723
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24133333333333334,
      "grad_norm": 3.515064577186422e-07,
      "kl": 0.04388427734375,
      "learning_rate": 1.8813034520649923e-05,
      "loss": 0.0018,
      "num_tokens": 43048889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 724
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24166666666666667,
      "grad_norm": 2.918212658187258e-07,
      "kl": 0.04815673828125,
      "learning_rate": 1.880753017828511e-05,
      "loss": 0.0019,
      "num_tokens": 43121705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 725
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.242,
      "grad_norm": 3.23695360293641e-07,
      "kl": 0.04595947265625,
      "learning_rate": 1.880201391180111e-05,
      "loss": 0.0018,
      "num_tokens": 43198377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 726
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24233333333333335,
      "grad_norm": 3.559672165920347e-07,
      "kl": 0.04974365234375,
      "learning_rate": 1.879648572866617e-05,
      "loss": 0.002,
      "num_tokens": 43272377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 727
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24266666666666667,
      "grad_norm": 3.7746602288279973e-07,
      "kl": 0.04852294921875,
      "learning_rate": 1.8790945636364628e-05,
      "loss": 0.0019,
      "num_tokens": 43347433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 728
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.243,
      "grad_norm": 3.523225302615174e-07,
      "kl": 0.04315185546875,
      "learning_rate": 1.8785393642396976e-05,
      "loss": 0.0017,
      "num_tokens": 43422921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 729
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24333333333333335,
      "grad_norm": 2.891792121317849e-07,
      "kl": 0.0472412109375,
      "learning_rate": 1.8779829754279806e-05,
      "loss": 0.0019,
      "num_tokens": 43497113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 730
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24366666666666667,
      "grad_norm": 5.161469402992225e-07,
      "kl": 0.04510498046875,
      "learning_rate": 1.877425397954582e-05,
      "loss": 0.0018,
      "num_tokens": 43573401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 731
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.244,
      "grad_norm": 4.558390855891048e-07,
      "kl": 0.0487060546875,
      "learning_rate": 1.876866632574381e-05,
      "loss": 0.0019,
      "num_tokens": 43649369.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 732
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24433333333333335,
      "grad_norm": 5.083584255771711e-07,
      "kl": 0.0450439453125,
      "learning_rate": 1.8763066800438638e-05,
      "loss": 0.0018,
      "num_tokens": 43725257.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 733
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24466666666666667,
      "grad_norm": 3.0682838314533e-07,
      "kl": 0.04461669921875,
      "learning_rate": 1.875745541121126e-05,
      "loss": 0.0018,
      "num_tokens": 43803545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 734
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.245,
      "grad_norm": 2.645230097186868e-07,
      "kl": 0.04833984375,
      "learning_rate": 1.8751832165658682e-05,
      "loss": 0.0019,
      "num_tokens": 43877401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 735
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24533333333333332,
      "grad_norm": 4.874401042798127e-07,
      "kl": 0.04931640625,
      "learning_rate": 1.874619707139396e-05,
      "loss": 0.002,
      "num_tokens": 43952441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 736
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24566666666666667,
      "grad_norm": 4.381076905701775e-07,
      "kl": 0.044677734375,
      "learning_rate": 1.8740550136046195e-05,
      "loss": 0.0018,
      "num_tokens": 44026809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 737
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.246,
      "grad_norm": 4.1323556843053666e-07,
      "kl": 0.04632568359375,
      "learning_rate": 1.8734891367260528e-05,
      "loss": 0.0019,
      "num_tokens": 44103081.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 738
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24633333333333332,
      "grad_norm": 6.676098109892337e-07,
      "kl": 0.0484619140625,
      "learning_rate": 1.8729220772698096e-05,
      "loss": 0.0019,
      "num_tokens": 44179401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 739
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24666666666666667,
      "grad_norm": 4.623036318207596e-07,
      "kl": 0.045654296875,
      "learning_rate": 1.872353836003608e-05,
      "loss": 0.0018,
      "num_tokens": 44254057.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 740
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.247,
      "grad_norm": 5.70516192510695e-07,
      "kl": 0.04852294921875,
      "learning_rate": 1.8717844136967626e-05,
      "loss": 0.0019,
      "num_tokens": 44330217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 741
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24733333333333332,
      "grad_norm": 3.8613060837633384e-07,
      "kl": 0.0433349609375,
      "learning_rate": 1.8712138111201898e-05,
      "loss": 0.0017,
      "num_tokens": 44405737.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 742
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24766666666666667,
      "grad_norm": 4.1050068944059603e-07,
      "kl": 0.04559326171875,
      "learning_rate": 1.870642029046402e-05,
      "loss": 0.0018,
      "num_tokens": 44480761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 743
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.248,
      "grad_norm": 5.079453444523097e-07,
      "kl": 0.0423583984375,
      "learning_rate": 1.87006906824951e-05,
      "loss": 0.0017,
      "num_tokens": 44558377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 744
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24833333333333332,
      "grad_norm": 3.411510363093839e-07,
      "kl": 0.0455322265625,
      "learning_rate": 1.869494929505219e-05,
      "loss": 0.0018,
      "num_tokens": 44631993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 745
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24866666666666667,
      "grad_norm": 3.024068746526609e-07,
      "kl": 0.04534912109375,
      "learning_rate": 1.8689196135908303e-05,
      "loss": 0.0018,
      "num_tokens": 44705193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 746
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.249,
      "grad_norm": 3.101951620010368e-07,
      "kl": 0.04217529296875,
      "learning_rate": 1.868343121285238e-05,
      "loss": 0.0017,
      "num_tokens": 44780217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 747
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24933333333333332,
      "grad_norm": 3.148875578062871e-07,
      "kl": 0.04498291015625,
      "learning_rate": 1.8677654533689287e-05,
      "loss": 0.0018,
      "num_tokens": 44855241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 748
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.24966666666666668,
      "grad_norm": 3.4707920804066816e-07,
      "kl": 0.04534912109375,
      "learning_rate": 1.8671866106239812e-05,
      "loss": 0.0018,
      "num_tokens": 44931225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 749
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25,
      "grad_norm": 3.905567780293495e-07,
      "kl": 0.0423583984375,
      "learning_rate": 1.866606593834065e-05,
      "loss": 0.0017,
      "num_tokens": 45008185.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 750
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25033333333333335,
      "grad_norm": 2.9572044013548293e-07,
      "kl": 0.04840087890625,
      "learning_rate": 1.866025403784439e-05,
      "loss": 0.0019,
      "num_tokens": 45081529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 751
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25066666666666665,
      "grad_norm": 2.1811952422012837e-07,
      "kl": 0.0435791015625,
      "learning_rate": 1.8654430412619494e-05,
      "loss": 0.0017,
      "num_tokens": 45159737.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 752
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.251,
      "grad_norm": 4.2060253235831624e-07,
      "kl": 0.04541015625,
      "learning_rate": 1.8648595070550312e-05,
      "loss": 0.0018,
      "num_tokens": 45235785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 753
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25133333333333335,
      "grad_norm": 5.803672706861107e-07,
      "kl": 0.0474853515625,
      "learning_rate": 1.864274801953705e-05,
      "loss": 0.0019,
      "num_tokens": 45313529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 754
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25166666666666665,
      "grad_norm": 3.2705352737139037e-07,
      "kl": 0.0484619140625,
      "learning_rate": 1.8636889267495767e-05,
      "loss": 0.0019,
      "num_tokens": 45387513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 755
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.252,
      "grad_norm": 2.806475549732568e-07,
      "kl": 0.046875,
      "learning_rate": 1.8631018822358363e-05,
      "loss": 0.0019,
      "num_tokens": 45462457.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 756
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25233333333333335,
      "grad_norm": 3.355482647293684e-07,
      "kl": 0.045654296875,
      "learning_rate": 1.8625136692072577e-05,
      "loss": 0.0018,
      "num_tokens": 45536105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 757
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25266666666666665,
      "grad_norm": 3.23345375363715e-07,
      "kl": 0.04541015625,
      "learning_rate": 1.8619242884601953e-05,
      "loss": 0.0018,
      "num_tokens": 45611513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 758
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.253,
      "grad_norm": 2.710647493131546e-07,
      "kl": 0.04583740234375,
      "learning_rate": 1.8613337407925854e-05,
      "loss": 0.0018,
      "num_tokens": 45685417.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 759
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25333333333333335,
      "grad_norm": 4.818232355319196e-07,
      "kl": 0.04400634765625,
      "learning_rate": 1.860742027003944e-05,
      "loss": 0.0018,
      "num_tokens": 45761305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 760
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25366666666666665,
      "grad_norm": 4.5073670662532095e-07,
      "kl": 0.04522705078125,
      "learning_rate": 1.860149147895366e-05,
      "loss": 0.0018,
      "num_tokens": 45836361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 761
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.254,
      "grad_norm": 3.986781109688309e-07,
      "kl": 0.046142578125,
      "learning_rate": 1.859555104269523e-05,
      "loss": 0.0018,
      "num_tokens": 45909721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 762
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25433333333333336,
      "grad_norm": 2.3543447014162666e-07,
      "kl": 0.04534912109375,
      "learning_rate": 1.8589598969306646e-05,
      "loss": 0.0018,
      "num_tokens": 45983065.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 763
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25466666666666665,
      "grad_norm": 4.235543258346297e-07,
      "kl": 0.04498291015625,
      "learning_rate": 1.8583635266846155e-05,
      "loss": 0.0018,
      "num_tokens": 46059401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 764
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.255,
      "grad_norm": 5.743987685491447e-07,
      "kl": 0.0455322265625,
      "learning_rate": 1.8577659943387737e-05,
      "loss": 0.0018,
      "num_tokens": 46135865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 765
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25533333333333336,
      "grad_norm": 4.641596831334027e-07,
      "kl": 0.04595947265625,
      "learning_rate": 1.8571673007021124e-05,
      "loss": 0.0018,
      "num_tokens": 46211961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 766
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25566666666666665,
      "grad_norm": 3.290427912361338e-07,
      "kl": 0.0447998046875,
      "learning_rate": 1.8565674465851753e-05,
      "loss": 0.0018,
      "num_tokens": 46287641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 767
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.256,
      "grad_norm": 5.178353035262262e-07,
      "kl": 0.04522705078125,
      "learning_rate": 1.8559664328000782e-05,
      "loss": 0.0018,
      "num_tokens": 46362937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 768
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25633333333333336,
      "grad_norm": 4.079249151800468e-07,
      "kl": 0.040283203125,
      "learning_rate": 1.855364260160507e-05,
      "loss": 0.0016,
      "num_tokens": 46444633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 769
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25666666666666665,
      "grad_norm": 3.4808999771485105e-07,
      "kl": 0.04498291015625,
      "learning_rate": 1.854760929481715e-05,
      "loss": 0.0018,
      "num_tokens": 46518649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 770
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.257,
      "grad_norm": 3.6083434906686307e-07,
      "kl": 0.0447998046875,
      "learning_rate": 1.854156441580526e-05,
      "loss": 0.0018,
      "num_tokens": 46593257.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 771
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25733333333333336,
      "grad_norm": 2.5221800115105e-07,
      "kl": 0.04656982421875,
      "learning_rate": 1.8535507972753275e-05,
      "loss": 0.0019,
      "num_tokens": 46667273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 772
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25766666666666665,
      "grad_norm": 2.962192411359865e-07,
      "kl": 0.04864501953125,
      "learning_rate": 1.852943997386075e-05,
      "loss": 0.0019,
      "num_tokens": 46741417.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 773
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.258,
      "grad_norm": 3.311577927433973e-07,
      "kl": 0.0386962890625,
      "learning_rate": 1.8523360427342877e-05,
      "loss": 0.0015,
      "num_tokens": 46818857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 774
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25833333333333336,
      "grad_norm": 4.447210244507005e-07,
      "kl": 0.04302978515625,
      "learning_rate": 1.851726934143048e-05,
      "loss": 0.0017,
      "num_tokens": 46896217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 775
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25866666666666666,
      "grad_norm": 0.00033984804758802056,
      "kl": 0.04327392578125,
      "learning_rate": 1.8511166724369997e-05,
      "loss": 0.0017,
      "num_tokens": 46972713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 776
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.259,
      "grad_norm": 3.32584590978513e-07,
      "kl": 0.0462646484375,
      "learning_rate": 1.85050525844235e-05,
      "loss": 0.0019,
      "num_tokens": 47052249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 777
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25933333333333336,
      "grad_norm": 3.147251845803112e-07,
      "kl": 0.04473876953125,
      "learning_rate": 1.849892692986864e-05,
      "loss": 0.0018,
      "num_tokens": 47128025.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 778
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.25966666666666666,
      "grad_norm": 4.90878960590635e-07,
      "kl": 0.04510498046875,
      "learning_rate": 1.8492789768998668e-05,
      "loss": 0.0018,
      "num_tokens": 47205561.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 779
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.26,
      "grad_norm": 3.842663147679559e-07,
      "kl": 0.044921875,
      "learning_rate": 1.848664111012241e-05,
      "loss": 0.0018,
      "num_tokens": 47280457.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 780
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.26033333333333336,
      "grad_norm": 3.071222920425498e-07,
      "kl": 0.04779052734375,
      "learning_rate": 1.848048096156426e-05,
      "loss": 0.0019,
      "num_tokens": 47356089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 781
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.26066666666666666,
      "grad_norm": 3.0678120310767554e-07,
      "kl": 0.0452880859375,
      "learning_rate": 1.8474309331664165e-05,
      "loss": 0.0018,
      "num_tokens": 47431497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 782
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.261,
      "grad_norm": 2.8356168968457496e-07,
      "kl": 0.04132080078125,
      "learning_rate": 1.8468126228777617e-05,
      "loss": 0.0017,
      "num_tokens": 47507161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 783
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2613333333333333,
      "grad_norm": 3.666902728127752e-07,
      "kl": 0.04412841796875,
      "learning_rate": 1.8461931661275642e-05,
      "loss": 0.0018,
      "num_tokens": 47586905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 784
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.26166666666666666,
      "grad_norm": 4.454333293324453e-07,
      "kl": 0.04132080078125,
      "learning_rate": 1.8455725637544784e-05,
      "loss": 0.0017,
      "num_tokens": 47664025.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 785
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.262,
      "grad_norm": 3.260820164996403e-07,
      "kl": 0.047607421875,
      "learning_rate": 1.8449508165987106e-05,
      "loss": 0.0019,
      "num_tokens": 47736921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 786
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2623333333333333,
      "grad_norm": 2.53294899721368e-07,
      "kl": 0.04412841796875,
      "learning_rate": 1.8443279255020153e-05,
      "loss": 0.0018,
      "num_tokens": 47809481.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 787
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.26266666666666666,
      "grad_norm": 4.612111297319643e-07,
      "kl": 0.04510498046875,
      "learning_rate": 1.8437038913076974e-05,
      "loss": 0.0018,
      "num_tokens": 47884473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 788
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.263,
      "grad_norm": 2.8864297973996145e-07,
      "kl": 0.048828125,
      "learning_rate": 1.8430787148606087e-05,
      "loss": 0.002,
      "num_tokens": 47959449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 789
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2633333333333333,
      "grad_norm": 2.3924124548102554e-07,
      "kl": 0.04827880859375,
      "learning_rate": 1.842452397007148e-05,
      "loss": 0.0019,
      "num_tokens": 48033017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 790
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.26366666666666666,
      "grad_norm": 3.323621911022201e-07,
      "kl": 0.0482177734375,
      "learning_rate": 1.8418249385952575e-05,
      "loss": 0.0019,
      "num_tokens": 48109225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 791
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.264,
      "grad_norm": 3.8284713355096756e-07,
      "kl": 0.04345703125,
      "learning_rate": 1.8411963404744263e-05,
      "loss": 0.0017,
      "num_tokens": 48186553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 792
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2643333333333333,
      "grad_norm": 2.897108402066806e-07,
      "kl": 0.0440673828125,
      "learning_rate": 1.8405666034956842e-05,
      "loss": 0.0018,
      "num_tokens": 48261433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 793
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.26466666666666666,
      "grad_norm": 2.4835526346578263e-07,
      "kl": 0.04583740234375,
      "learning_rate": 1.8399357285116045e-05,
      "loss": 0.0018,
      "num_tokens": 48335705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 794
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.265,
      "grad_norm": 3.303800610865437e-07,
      "kl": 0.0460205078125,
      "learning_rate": 1.8393037163763005e-05,
      "loss": 0.0018,
      "num_tokens": 48412825.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 795
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2653333333333333,
      "grad_norm": 3.7167754385336593e-07,
      "kl": 0.04705810546875,
      "learning_rate": 1.8386705679454243e-05,
      "loss": 0.0019,
      "num_tokens": 48488409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 796
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.26566666666666666,
      "grad_norm": 4.7247795009752735e-07,
      "kl": 0.04736328125,
      "learning_rate": 1.8380362840761675e-05,
      "loss": 0.0019,
      "num_tokens": 48568489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 797
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.266,
      "grad_norm": 2.2959727630222915e-07,
      "kl": 0.04705810546875,
      "learning_rate": 1.8374008656272585e-05,
      "loss": 0.0019,
      "num_tokens": 48642249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 798
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2663333333333333,
      "grad_norm": 3.831698620615498e-07,
      "kl": 0.04339599609375,
      "learning_rate": 1.836764313458962e-05,
      "loss": 0.0017,
      "num_tokens": 48720409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 799
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.26666666666666666,
      "grad_norm": 4.5236652113089804e-07,
      "kl": 0.0450439453125,
      "learning_rate": 1.836126628433077e-05,
      "loss": 0.0018,
      "num_tokens": 48795929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 800
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.267,
      "grad_norm": 4.7561653104821744e-07,
      "kl": 0.04736328125,
      "learning_rate": 1.8354878114129368e-05,
      "loss": 0.0019,
      "num_tokens": 48874521.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 801
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2673333333333333,
      "grad_norm": 4.804242621503363e-07,
      "kl": 0.045654296875,
      "learning_rate": 1.8348478632634067e-05,
      "loss": 0.0018,
      "num_tokens": 48953065.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 802
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.26766666666666666,
      "grad_norm": 3.1694065683041117e-07,
      "kl": 0.043212890625,
      "learning_rate": 1.8342067848508843e-05,
      "loss": 0.0017,
      "num_tokens": 49028649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 803
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.268,
      "grad_norm": 2.6863898483497906e-07,
      "kl": 0.04705810546875,
      "learning_rate": 1.8335645770432963e-05,
      "loss": 0.0019,
      "num_tokens": 49102777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 804
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2683333333333333,
      "grad_norm": 4.76823487360889e-07,
      "kl": 0.0452880859375,
      "learning_rate": 1.8329212407100996e-05,
      "loss": 0.0018,
      "num_tokens": 49180441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 805
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.26866666666666666,
      "grad_norm": 3.1277591006073635e-07,
      "kl": 0.04461669921875,
      "learning_rate": 1.832276776722278e-05,
      "loss": 0.0018,
      "num_tokens": 49255897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 806
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.269,
      "grad_norm": 4.953151346853701e-07,
      "kl": 0.0472412109375,
      "learning_rate": 1.831631185952342e-05,
      "loss": 0.0019,
      "num_tokens": 49332553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 807
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2693333333333333,
      "grad_norm": 2.5542902903907816e-07,
      "kl": 0.04595947265625,
      "learning_rate": 1.8309844692743283e-05,
      "loss": 0.0018,
      "num_tokens": 49407545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 808
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.26966666666666667,
      "grad_norm": 3.5005550103051064e-07,
      "kl": 0.04656982421875,
      "learning_rate": 1.8303366275637977e-05,
      "loss": 0.0019,
      "num_tokens": 49481577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 809
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.27,
      "grad_norm": 3.5171862577954016e-07,
      "kl": 0.04779052734375,
      "learning_rate": 1.8296876616978337e-05,
      "loss": 0.0019,
      "num_tokens": 49556841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 810
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2703333333333333,
      "grad_norm": 3.392306382465904e-07,
      "kl": 0.04425048828125,
      "learning_rate": 1.8290375725550417e-05,
      "loss": 0.0018,
      "num_tokens": 49631641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 811
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.27066666666666667,
      "grad_norm": 2.7070387886851677e-07,
      "kl": 0.04412841796875,
      "learning_rate": 1.828386361015549e-05,
      "loss": 0.0018,
      "num_tokens": 49705785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 812
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.271,
      "grad_norm": 2.6789638241098146e-07,
      "kl": 0.048583984375,
      "learning_rate": 1.827734027961001e-05,
      "loss": 0.0019,
      "num_tokens": 49781337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 813
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2713333333333333,
      "grad_norm": 1.6448592532469775e-07,
      "kl": 0.0450439453125,
      "learning_rate": 1.827080574274562e-05,
      "loss": 0.0018,
      "num_tokens": 49861385.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 814
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.27166666666666667,
      "grad_norm": 3.167813815707632e-07,
      "kl": 0.04876708984375,
      "learning_rate": 1.8264260008409138e-05,
      "loss": 0.0019,
      "num_tokens": 49935945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 815
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.272,
      "grad_norm": 2.637715965647658e-07,
      "kl": 0.04156494140625,
      "learning_rate": 1.8257703085462542e-05,
      "loss": 0.0017,
      "num_tokens": 50017225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 816
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2723333333333333,
      "grad_norm": 2.5883832677209284e-07,
      "kl": 0.04510498046875,
      "learning_rate": 1.8251134982782952e-05,
      "loss": 0.0018,
      "num_tokens": 50095513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 817
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.27266666666666667,
      "grad_norm": 2.8478842750701006e-07,
      "kl": 0.0426025390625,
      "learning_rate": 1.8244555709262627e-05,
      "loss": 0.0017,
      "num_tokens": 50170729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 818
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.273,
      "grad_norm": 4.098873489510879e-07,
      "kl": 0.04779052734375,
      "learning_rate": 1.823796527380895e-05,
      "loss": 0.0019,
      "num_tokens": 50245881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 819
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2733333333333333,
      "grad_norm": 2.421719784706511e-07,
      "kl": 0.04608154296875,
      "learning_rate": 1.8231363685344422e-05,
      "loss": 0.0018,
      "num_tokens": 50320265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 820
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.27366666666666667,
      "grad_norm": 2.761874782208906e-07,
      "kl": 0.04400634765625,
      "learning_rate": 1.8224750952806626e-05,
      "loss": 0.0018,
      "num_tokens": 50394809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 821
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.274,
      "grad_norm": 2.2788773890169978e-07,
      "kl": 0.04498291015625,
      "learning_rate": 1.8218127085148246e-05,
      "loss": 0.0018,
      "num_tokens": 50468281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 822
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2743333333333333,
      "grad_norm": 3.9760578829373117e-07,
      "kl": 0.04376220703125,
      "learning_rate": 1.821149209133704e-05,
      "loss": 0.0018,
      "num_tokens": 50544121.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 823
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.27466666666666667,
      "grad_norm": 2.4715049562473723e-07,
      "kl": 0.04669189453125,
      "learning_rate": 1.8204845980355834e-05,
      "loss": 0.0019,
      "num_tokens": 50618329.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 824
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.275,
      "grad_norm": 3.326194644159841e-07,
      "kl": 0.046142578125,
      "learning_rate": 1.8198188761202487e-05,
      "loss": 0.0018,
      "num_tokens": 50693897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 825
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2753333333333333,
      "grad_norm": 4.2275809164493694e-07,
      "kl": 0.0460205078125,
      "learning_rate": 1.819152044288992e-05,
      "loss": 0.0018,
      "num_tokens": 50771017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 826
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.27566666666666667,
      "grad_norm": 4.921398044643865e-07,
      "kl": 0.0445556640625,
      "learning_rate": 1.818484103444606e-05,
      "loss": 0.0018,
      "num_tokens": 50847577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 827
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.276,
      "grad_norm": 5.2437275144257e-07,
      "kl": 0.0478515625,
      "learning_rate": 1.8178150544913867e-05,
      "loss": 0.0019,
      "num_tokens": 50926281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 828
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2763333333333333,
      "grad_norm": 4.7230884092641645e-07,
      "kl": 0.04473876953125,
      "learning_rate": 1.8171448983351284e-05,
      "loss": 0.0018,
      "num_tokens": 51000761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 829
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.27666666666666667,
      "grad_norm": 5.202701913731289e-07,
      "kl": 0.04638671875,
      "learning_rate": 1.8164736358831265e-05,
      "loss": 0.0019,
      "num_tokens": 51076969.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 830
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.277,
      "grad_norm": 2.214551102497353e-07,
      "kl": 0.04296875,
      "learning_rate": 1.8158012680441723e-05,
      "loss": 0.0017,
      "num_tokens": 51151049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 831
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2773333333333333,
      "grad_norm": 1.864429606257545e-07,
      "kl": 0.041015625,
      "learning_rate": 1.815127795728554e-05,
      "loss": 0.0016,
      "num_tokens": 51224953.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 832
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2776666666666667,
      "grad_norm": 3.1296372071665246e-07,
      "kl": 0.04534912109375,
      "learning_rate": 1.814453219848057e-05,
      "loss": 0.0018,
      "num_tokens": 51300777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 833
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.278,
      "grad_norm": 2.567128376540495e-07,
      "kl": 0.04315185546875,
      "learning_rate": 1.813777541315958e-05,
      "loss": 0.0017,
      "num_tokens": 51376217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 834
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2783333333333333,
      "grad_norm": 2.9446914595609996e-07,
      "kl": 0.04632568359375,
      "learning_rate": 1.8131007610470278e-05,
      "loss": 0.0019,
      "num_tokens": 51452761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 835
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2786666666666667,
      "grad_norm": 2.92154226144703e-07,
      "kl": 0.0419921875,
      "learning_rate": 1.8124228799575295e-05,
      "loss": 0.0017,
      "num_tokens": 51527145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 836
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.279,
      "grad_norm": 4.3799781224151957e-07,
      "kl": 0.04571533203125,
      "learning_rate": 1.811743898965215e-05,
      "loss": 0.0018,
      "num_tokens": 51602777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 837
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2793333333333333,
      "grad_norm": 4.4972955492994515e-07,
      "kl": 0.0440673828125,
      "learning_rate": 1.8110638189893267e-05,
      "loss": 0.0018,
      "num_tokens": 51677993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 838
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2796666666666667,
      "grad_norm": 2.8373688110150397e-07,
      "kl": 0.04547119140625,
      "learning_rate": 1.8103826409505944e-05,
      "loss": 0.0018,
      "num_tokens": 51752985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 839
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.28,
      "grad_norm": 3.949029121486092e-07,
      "kl": 0.04632568359375,
      "learning_rate": 1.8097003657712343e-05,
      "loss": 0.0019,
      "num_tokens": 51827497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 840
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2803333333333333,
      "grad_norm": 2.2465060567355977e-07,
      "kl": 0.04547119140625,
      "learning_rate": 1.8090169943749477e-05,
      "loss": 0.0018,
      "num_tokens": 51901081.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 841
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2806666666666667,
      "grad_norm": 3.431296988765098e-07,
      "kl": 0.0439453125,
      "learning_rate": 1.8083325276869207e-05,
      "loss": 0.0018,
      "num_tokens": 51975625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 842
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.281,
      "grad_norm": 2.854775971172785e-07,
      "kl": 0.0416259765625,
      "learning_rate": 1.807646966633822e-05,
      "loss": 0.0017,
      "num_tokens": 52051049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 843
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2813333333333333,
      "grad_norm": 1.9648739169042528e-07,
      "kl": 0.0433349609375,
      "learning_rate": 1.806960312143802e-05,
      "loss": 0.0017,
      "num_tokens": 52125241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 844
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2816666666666667,
      "grad_norm": 2.4585619939898606e-07,
      "kl": 0.04327392578125,
      "learning_rate": 1.8062725651464913e-05,
      "loss": 0.0017,
      "num_tokens": 52200409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 845
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.282,
      "grad_norm": 3.198763351974776e-07,
      "kl": 0.04315185546875,
      "learning_rate": 1.8055837265729996e-05,
      "loss": 0.0017,
      "num_tokens": 52274393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 846
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2823333333333333,
      "grad_norm": 3.0104936854513653e-07,
      "kl": 0.0465087890625,
      "learning_rate": 1.804893797355914e-05,
      "loss": 0.0019,
      "num_tokens": 52347929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 847
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2826666666666667,
      "grad_norm": 2.0402315215051203e-07,
      "kl": 0.04656982421875,
      "learning_rate": 1.8042027784292998e-05,
      "loss": 0.0019,
      "num_tokens": 52423001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 848
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.283,
      "grad_norm": 2.624065018608235e-07,
      "kl": 0.04534912109375,
      "learning_rate": 1.8035106707286957e-05,
      "loss": 0.0018,
      "num_tokens": 52496521.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 849
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2833333333333333,
      "grad_norm": 2.976613302507758e-07,
      "kl": 0.0447998046875,
      "learning_rate": 1.8028174751911147e-05,
      "loss": 0.0018,
      "num_tokens": 52571577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 850
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2836666666666667,
      "grad_norm": 3.7486356063709536e-07,
      "kl": 0.047607421875,
      "learning_rate": 1.802123192755044e-05,
      "loss": 0.0019,
      "num_tokens": 52645721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 851
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.284,
      "grad_norm": 2.776278051896952e-07,
      "kl": 0.0455322265625,
      "learning_rate": 1.8014278243604407e-05,
      "loss": 0.0018,
      "num_tokens": 52719785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 852
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2843333333333333,
      "grad_norm": 3.085508240019408e-07,
      "kl": 0.04376220703125,
      "learning_rate": 1.8007313709487334e-05,
      "loss": 0.0017,
      "num_tokens": 52795209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 853
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2846666666666667,
      "grad_norm": 5.131801117386203e-07,
      "kl": 0.0484619140625,
      "learning_rate": 1.800033833462819e-05,
      "loss": 0.0019,
      "num_tokens": 52870553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 854
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.285,
      "grad_norm": 5.162467004993232e-07,
      "kl": 0.04742431640625,
      "learning_rate": 1.7993352128470617e-05,
      "loss": 0.0019,
      "num_tokens": 52946249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 855
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2853333333333333,
      "grad_norm": 3.6292138361204707e-07,
      "kl": 0.045166015625,
      "learning_rate": 1.798635510047293e-05,
      "loss": 0.0018,
      "num_tokens": 53022265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 856
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2856666666666667,
      "grad_norm": 2.2523936138441059e-07,
      "kl": 0.04351806640625,
      "learning_rate": 1.7979347260108088e-05,
      "loss": 0.0017,
      "num_tokens": 53095801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 857
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.286,
      "grad_norm": 2.972224422137515e-07,
      "kl": 0.04571533203125,
      "learning_rate": 1.797232861686369e-05,
      "loss": 0.0018,
      "num_tokens": 53171497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 858
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.28633333333333333,
      "grad_norm": 3.185786567883042e-07,
      "kl": 0.04144287109375,
      "learning_rate": 1.7965299180241963e-05,
      "loss": 0.0017,
      "num_tokens": 53246665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 859
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2866666666666667,
      "grad_norm": 3.81515633307572e-07,
      "kl": 0.04229736328125,
      "learning_rate": 1.7958258959759747e-05,
      "loss": 0.0017,
      "num_tokens": 53321897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 860
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.287,
      "grad_norm": 4.322461109040887e-07,
      "kl": 0.04693603515625,
      "learning_rate": 1.795120796494848e-05,
      "loss": 0.0019,
      "num_tokens": 53398409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 861
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.28733333333333333,
      "grad_norm": 5.812352696921153e-07,
      "kl": 0.0474853515625,
      "learning_rate": 1.7944146205354182e-05,
      "loss": 0.0019,
      "num_tokens": 53475929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 862
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2876666666666667,
      "grad_norm": 3.7971406641190697e-07,
      "kl": 0.04779052734375,
      "learning_rate": 1.793707369053746e-05,
      "loss": 0.0019,
      "num_tokens": 53552137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 863
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.288,
      "grad_norm": 2.2259139598190814e-07,
      "kl": 0.0406494140625,
      "learning_rate": 1.7929990430073463e-05,
      "loss": 0.0016,
      "num_tokens": 53627721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 864
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.28833333333333333,
      "grad_norm": 4.0882432017497194e-07,
      "kl": 0.047119140625,
      "learning_rate": 1.792289643355191e-05,
      "loss": 0.0019,
      "num_tokens": 53704137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 865
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2886666666666667,
      "grad_norm": 2.311708868774076e-07,
      "kl": 0.04638671875,
      "learning_rate": 1.7915791710577035e-05,
      "loss": 0.0019,
      "num_tokens": 53778377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 866
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.289,
      "grad_norm": 4.044439094741392e-07,
      "kl": 0.04559326171875,
      "learning_rate": 1.7908676270767608e-05,
      "loss": 0.0018,
      "num_tokens": 53856073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 867
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.28933333333333333,
      "grad_norm": 2.2255395037973358e-07,
      "kl": 0.04461669921875,
      "learning_rate": 1.7901550123756906e-05,
      "loss": 0.0018,
      "num_tokens": 53930841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 868
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2896666666666667,
      "grad_norm": 6.428173833228357e-07,
      "kl": 0.04754638671875,
      "learning_rate": 1.7894413279192693e-05,
      "loss": 0.0019,
      "num_tokens": 54009897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 869
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.29,
      "grad_norm": 3.9727933653921355e-07,
      "kl": 0.04168701171875,
      "learning_rate": 1.7887265746737224e-05,
      "loss": 0.0017,
      "num_tokens": 54084233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 870
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.29033333333333333,
      "grad_norm": 2.666711793608556e-07,
      "kl": 0.0462646484375,
      "learning_rate": 1.788010753606722e-05,
      "loss": 0.0018,
      "num_tokens": 54159289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 871
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2906666666666667,
      "grad_norm": 3.67074449059146e-07,
      "kl": 0.04736328125,
      "learning_rate": 1.7872938656873864e-05,
      "loss": 0.0019,
      "num_tokens": 54236153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 872
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.291,
      "grad_norm": 3.2913288805502816e-07,
      "kl": 0.04962158203125,
      "learning_rate": 1.7865759118862784e-05,
      "loss": 0.002,
      "num_tokens": 54310169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 873
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.29133333333333333,
      "grad_norm": 3.2400063787463296e-07,
      "kl": 0.04327392578125,
      "learning_rate": 1.785856893175402e-05,
      "loss": 0.0017,
      "num_tokens": 54384985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 874
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2916666666666667,
      "grad_norm": 3.641858370428963e-07,
      "kl": 0.0450439453125,
      "learning_rate": 1.7851368105282054e-05,
      "loss": 0.0018,
      "num_tokens": 54464265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 875
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.292,
      "grad_norm": 4.1312321741315827e-07,
      "kl": 0.03985595703125,
      "learning_rate": 1.784415664919576e-05,
      "loss": 0.0016,
      "num_tokens": 54541881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 876
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.29233333333333333,
      "grad_norm": 2.482996137587179e-07,
      "kl": 0.048095703125,
      "learning_rate": 1.78369345732584e-05,
      "loss": 0.0019,
      "num_tokens": 54616297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 877
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2926666666666667,
      "grad_norm": 3.0637730219496007e-07,
      "kl": 0.046875,
      "learning_rate": 1.7829701887247618e-05,
      "loss": 0.0019,
      "num_tokens": 54692985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 878
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.293,
      "grad_norm": 3.00032922950777e-07,
      "kl": 0.04827880859375,
      "learning_rate": 1.7822458600955432e-05,
      "loss": 0.0019,
      "num_tokens": 54768745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 879
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.29333333333333333,
      "grad_norm": 3.8900296317478933e-07,
      "kl": 0.04766845703125,
      "learning_rate": 1.781520472418819e-05,
      "loss": 0.0019,
      "num_tokens": 54845465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 880
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2936666666666667,
      "grad_norm": 4.389564765006071e-07,
      "kl": 0.04437255859375,
      "learning_rate": 1.7807940266766595e-05,
      "loss": 0.0018,
      "num_tokens": 54922169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 881
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.294,
      "grad_norm": 2.6403799324725696e-07,
      "kl": 0.0447998046875,
      "learning_rate": 1.780066523852567e-05,
      "loss": 0.0018,
      "num_tokens": 54995785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 882
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.29433333333333334,
      "grad_norm": 2.7337617325429164e-07,
      "kl": 0.04443359375,
      "learning_rate": 1.7793379649314743e-05,
      "loss": 0.0018,
      "num_tokens": 55073961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 883
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2946666666666667,
      "grad_norm": 6.265134970817599e-07,
      "kl": 0.04644775390625,
      "learning_rate": 1.7786083508997452e-05,
      "loss": 0.0019,
      "num_tokens": 55152153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 884
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.295,
      "grad_norm": 5.472901420944254e-07,
      "kl": 0.0482177734375,
      "learning_rate": 1.7778776827451715e-05,
      "loss": 0.0019,
      "num_tokens": 55228553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 885
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.29533333333333334,
      "grad_norm": 2.1152351337150321e-07,
      "kl": 0.04718017578125,
      "learning_rate": 1.777145961456971e-05,
      "loss": 0.0019,
      "num_tokens": 55302281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 886
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2956666666666667,
      "grad_norm": 2.692565601591923e-07,
      "kl": 0.0491943359375,
      "learning_rate": 1.7764131880257892e-05,
      "loss": 0.002,
      "num_tokens": 55377193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 887
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.296,
      "grad_norm": 3.8602908603024844e-07,
      "kl": 0.04852294921875,
      "learning_rate": 1.7756793634436947e-05,
      "loss": 0.0019,
      "num_tokens": 55452937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 888
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.29633333333333334,
      "grad_norm": 2.8668037543866376e-07,
      "kl": 0.04498291015625,
      "learning_rate": 1.7749444887041797e-05,
      "loss": 0.0018,
      "num_tokens": 55527161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 889
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2966666666666667,
      "grad_norm": 4.927192094328348e-07,
      "kl": 0.0438232421875,
      "learning_rate": 1.774208564802158e-05,
      "loss": 0.0018,
      "num_tokens": 55603497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 890
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.297,
      "grad_norm": 2.734639963364316e-07,
      "kl": 0.04498291015625,
      "learning_rate": 1.7734715927339642e-05,
      "loss": 0.0018,
      "num_tokens": 55678265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 891
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.29733333333333334,
      "grad_norm": 3.410806357351248e-07,
      "kl": 0.04425048828125,
      "learning_rate": 1.7727335734973512e-05,
      "loss": 0.0018,
      "num_tokens": 55753465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 892
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2976666666666667,
      "grad_norm": 3.435504822846269e-07,
      "kl": 0.04461669921875,
      "learning_rate": 1.7719945080914902e-05,
      "loss": 0.0018,
      "num_tokens": 55829113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 893
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.298,
      "grad_norm": 2.851470526366029e-07,
      "kl": 0.045654296875,
      "learning_rate": 1.7712543975169687e-05,
      "loss": 0.0018,
      "num_tokens": 55903609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 894
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.29833333333333334,
      "grad_norm": 3.106248982476245e-07,
      "kl": 0.04766845703125,
      "learning_rate": 1.7705132427757895e-05,
      "loss": 0.0019,
      "num_tokens": 55978201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 895
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2986666666666667,
      "grad_norm": 4.185882573892741e-07,
      "kl": 0.04412841796875,
      "learning_rate": 1.769771044871368e-05,
      "loss": 0.0018,
      "num_tokens": 56053929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 896
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.299,
      "grad_norm": 4.752161260057619e-07,
      "kl": 0.048095703125,
      "learning_rate": 1.7690278048085327e-05,
      "loss": 0.0019,
      "num_tokens": 56132329.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 897
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.29933333333333334,
      "grad_norm": 1.4269281223278085e-07,
      "kl": 0.04473876953125,
      "learning_rate": 1.7682835235935236e-05,
      "loss": 0.0018,
      "num_tokens": 56205241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 898
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.2996666666666667,
      "grad_norm": 2.60033175436547e-07,
      "kl": 0.0484619140625,
      "learning_rate": 1.767538202233989e-05,
      "loss": 0.0019,
      "num_tokens": 56280233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 899
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3,
      "grad_norm": 4.6069982317931135e-07,
      "kl": 0.04437255859375,
      "learning_rate": 1.7667918417389857e-05,
      "loss": 0.0018,
      "num_tokens": 56357945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 900
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30033333333333334,
      "grad_norm": 5.101852025291009e-07,
      "kl": 0.04656982421875,
      "learning_rate": 1.766044443118978e-05,
      "loss": 0.0019,
      "num_tokens": 56436169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 901
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3006666666666667,
      "grad_norm": 3.986527872257284e-07,
      "kl": 0.04510498046875,
      "learning_rate": 1.765296007385836e-05,
      "loss": 0.0018,
      "num_tokens": 56511657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 902
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.301,
      "grad_norm": 4.7448915552195103e-07,
      "kl": 0.0465087890625,
      "learning_rate": 1.7645465355528317e-05,
      "loss": 0.0019,
      "num_tokens": 56589433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 903
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30133333333333334,
      "grad_norm": 3.2258756732517213e-07,
      "kl": 0.0443115234375,
      "learning_rate": 1.7637960286346423e-05,
      "loss": 0.0018,
      "num_tokens": 56664777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 904
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3016666666666667,
      "grad_norm": 3.5260444519735756e-07,
      "kl": 0.0467529296875,
      "learning_rate": 1.763044487647345e-05,
      "loss": 0.0019,
      "num_tokens": 56739961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 905
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.302,
      "grad_norm": 3.680910651837621e-07,
      "kl": 0.0440673828125,
      "learning_rate": 1.7622919136084183e-05,
      "loss": 0.0018,
      "num_tokens": 56816809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 906
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30233333333333334,
      "grad_norm": 2.2457147963450552e-07,
      "kl": 0.0439453125,
      "learning_rate": 1.761538307536737e-05,
      "loss": 0.0018,
      "num_tokens": 56892473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 907
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30266666666666664,
      "grad_norm": 4.272202147603821e-07,
      "kl": 0.04364013671875,
      "learning_rate": 1.760783670452575e-05,
      "loss": 0.0017,
      "num_tokens": 56970425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 908
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.303,
      "grad_norm": 2.722154874845728e-07,
      "kl": 0.04241943359375,
      "learning_rate": 1.7600280033776018e-05,
      "loss": 0.0017,
      "num_tokens": 57045177.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 909
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30333333333333334,
      "grad_norm": 4.124611052702676e-07,
      "kl": 0.04315185546875,
      "learning_rate": 1.759271307334881e-05,
      "loss": 0.0017,
      "num_tokens": 57122601.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 910
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30366666666666664,
      "grad_norm": 2.497382070032472e-07,
      "kl": 0.0465087890625,
      "learning_rate": 1.7585135833488692e-05,
      "loss": 0.0019,
      "num_tokens": 57196281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 911
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.304,
      "grad_norm": 3.360646587680094e-07,
      "kl": 0.041259765625,
      "learning_rate": 1.7577548324454148e-05,
      "loss": 0.0017,
      "num_tokens": 57274073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 912
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30433333333333334,
      "grad_norm": 3.755442037345347e-07,
      "kl": 0.0496826171875,
      "learning_rate": 1.7569950556517566e-05,
      "loss": 0.002,
      "num_tokens": 57351017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 913
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30466666666666664,
      "grad_norm": 3.8886116726644104e-07,
      "kl": 0.04388427734375,
      "learning_rate": 1.7562342539965223e-05,
      "loss": 0.0018,
      "num_tokens": 57425753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 914
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.305,
      "grad_norm": 2.884500815980573e-07,
      "kl": 0.04595947265625,
      "learning_rate": 1.7554724285097272e-05,
      "loss": 0.0018,
      "num_tokens": 57500313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 915
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30533333333333335,
      "grad_norm": 3.6437225503505033e-07,
      "kl": 0.04486083984375,
      "learning_rate": 1.7547095802227723e-05,
      "loss": 0.0018,
      "num_tokens": 57576313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 916
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30566666666666664,
      "grad_norm": 4.301882370327803e-07,
      "kl": 0.04559326171875,
      "learning_rate": 1.7539457101684434e-05,
      "loss": 0.0018,
      "num_tokens": 57653193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 917
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.306,
      "grad_norm": 3.6725342056342924e-07,
      "kl": 0.04827880859375,
      "learning_rate": 1.7531808193809106e-05,
      "loss": 0.0019,
      "num_tokens": 57729689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 918
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30633333333333335,
      "grad_norm": 2.8100902227379265e-07,
      "kl": 0.04522705078125,
      "learning_rate": 1.7524149088957244e-05,
      "loss": 0.0018,
      "num_tokens": 57805353.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 919
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30666666666666664,
      "grad_norm": 4.2409971001688973e-07,
      "kl": 0.0440673828125,
      "learning_rate": 1.7516479797498172e-05,
      "loss": 0.0018,
      "num_tokens": 57881577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 920
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.307,
      "grad_norm": 2.477618465945852e-07,
      "kl": 0.04925537109375,
      "learning_rate": 1.7508800329814993e-05,
      "loss": 0.002,
      "num_tokens": 57955225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 921
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30733333333333335,
      "grad_norm": 3.451582983871049e-07,
      "kl": 0.0452880859375,
      "learning_rate": 1.7501110696304598e-05,
      "loss": 0.0018,
      "num_tokens": 58031321.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 922
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30766666666666664,
      "grad_norm": 3.8743954178244167e-07,
      "kl": 0.04351806640625,
      "learning_rate": 1.749341090737763e-05,
      "loss": 0.0017,
      "num_tokens": 58106921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 923
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.308,
      "grad_norm": 3.9689820141575183e-07,
      "kl": 0.04718017578125,
      "learning_rate": 1.7485700973458494e-05,
      "loss": 0.0019,
      "num_tokens": 58182617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 924
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30833333333333335,
      "grad_norm": 6.127338565420359e-07,
      "kl": 0.0474853515625,
      "learning_rate": 1.747798090498532e-05,
      "loss": 0.0019,
      "num_tokens": 58261289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 925
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30866666666666664,
      "grad_norm": 3.2155134022104903e-07,
      "kl": 0.04595947265625,
      "learning_rate": 1.7470250712409963e-05,
      "loss": 0.0018,
      "num_tokens": 58337369.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 926
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.309,
      "grad_norm": 5.713849873245636e-07,
      "kl": 0.04425048828125,
      "learning_rate": 1.746251040619798e-05,
      "loss": 0.0018,
      "num_tokens": 58418217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 927
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30933333333333335,
      "grad_norm": 3.990920447449753e-07,
      "kl": 0.0421142578125,
      "learning_rate": 1.7454759996828622e-05,
      "loss": 0.0017,
      "num_tokens": 58495241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 928
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.30966666666666665,
      "grad_norm": 2.7729507223739347e-07,
      "kl": 0.04541015625,
      "learning_rate": 1.744699949479483e-05,
      "loss": 0.0018,
      "num_tokens": 58569657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 929
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31,
      "grad_norm": 4.045870127811213e-07,
      "kl": 0.04571533203125,
      "learning_rate": 1.7439228910603184e-05,
      "loss": 0.0018,
      "num_tokens": 58645721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 930
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31033333333333335,
      "grad_norm": 5.207534172768646e-07,
      "kl": 0.04290771484375,
      "learning_rate": 1.7431448254773943e-05,
      "loss": 0.0017,
      "num_tokens": 58722697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 931
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31066666666666665,
      "grad_norm": 3.929250738110568e-07,
      "kl": 0.0489501953125,
      "learning_rate": 1.7423657537840978e-05,
      "loss": 0.002,
      "num_tokens": 58797369.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 932
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.311,
      "grad_norm": 3.662092922240845e-07,
      "kl": 0.0447998046875,
      "learning_rate": 1.7415856770351797e-05,
      "loss": 0.0018,
      "num_tokens": 58874569.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 933
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31133333333333335,
      "grad_norm": 3.5038416967836383e-07,
      "kl": 0.0496826171875,
      "learning_rate": 1.74080459628675e-05,
      "loss": 0.002,
      "num_tokens": 58950345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 934
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31166666666666665,
      "grad_norm": 3.007642135344213e-07,
      "kl": 0.04705810546875,
      "learning_rate": 1.7400225125962796e-05,
      "loss": 0.0019,
      "num_tokens": 59024313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 935
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.312,
      "grad_norm": 4.5071922727402125e-07,
      "kl": 0.0450439453125,
      "learning_rate": 1.739239427022596e-05,
      "loss": 0.0018,
      "num_tokens": 59099897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 936
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31233333333333335,
      "grad_norm": 0.00034065076033584774,
      "kl": 0.045166015625,
      "learning_rate": 1.7384553406258842e-05,
      "loss": 0.0018,
      "num_tokens": 59180329.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 937
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31266666666666665,
      "grad_norm": 5.446334512271278e-07,
      "kl": 0.04754638671875,
      "learning_rate": 1.7376702544676823e-05,
      "loss": 0.0019,
      "num_tokens": 59256793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 938
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.313,
      "grad_norm": 3.392544840608025e-07,
      "kl": 0.04736328125,
      "learning_rate": 1.736884169610884e-05,
      "loss": 0.0019,
      "num_tokens": 59331817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 939
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31333333333333335,
      "grad_norm": 2.0961358870863478e-07,
      "kl": 0.0478515625,
      "learning_rate": 1.7360970871197347e-05,
      "loss": 0.0019,
      "num_tokens": 59406393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 940
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31366666666666665,
      "grad_norm": 3.587256571790931e-07,
      "kl": 0.04534912109375,
      "learning_rate": 1.735309008059829e-05,
      "loss": 0.0018,
      "num_tokens": 59482473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 941
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.314,
      "grad_norm": 3.310771319320338e-07,
      "kl": 0.0467529296875,
      "learning_rate": 1.734519933498112e-05,
      "loss": 0.0019,
      "num_tokens": 59558041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 942
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31433333333333335,
      "grad_norm": 3.3980271041400556e-07,
      "kl": 0.04425048828125,
      "learning_rate": 1.7337298645028764e-05,
      "loss": 0.0018,
      "num_tokens": 59634313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 943
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31466666666666665,
      "grad_norm": 5.596580194833223e-07,
      "kl": 0.04815673828125,
      "learning_rate": 1.7329388021437615e-05,
      "loss": 0.0019,
      "num_tokens": 59712441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 944
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.315,
      "grad_norm": 0.0003744710411410779,
      "kl": 0.04437255859375,
      "learning_rate": 1.7321467474917502e-05,
      "loss": 0.0018,
      "num_tokens": 59788585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 945
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31533333333333335,
      "grad_norm": 4.3050246745224285e-07,
      "kl": 0.0472412109375,
      "learning_rate": 1.7313537016191706e-05,
      "loss": 0.0019,
      "num_tokens": 59870137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 946
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31566666666666665,
      "grad_norm": 3.356896343120752e-07,
      "kl": 0.0452880859375,
      "learning_rate": 1.7305596655996916e-05,
      "loss": 0.0018,
      "num_tokens": 59949209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 947
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.316,
      "grad_norm": 1.6127195578974352e-07,
      "kl": 0.0400390625,
      "learning_rate": 1.729764640508322e-05,
      "loss": 0.0016,
      "num_tokens": 60023209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 948
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31633333333333336,
      "grad_norm": 2.6903279604084673e-07,
      "kl": 0.04583740234375,
      "learning_rate": 1.7289686274214116e-05,
      "loss": 0.0018,
      "num_tokens": 60097129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 949
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31666666666666665,
      "grad_norm": 2.99013805715731e-07,
      "kl": 0.04278564453125,
      "learning_rate": 1.7281716274166464e-05,
      "loss": 0.0017,
      "num_tokens": 60173849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 950
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.317,
      "grad_norm": 2.4149562705133576e-07,
      "kl": 0.04486083984375,
      "learning_rate": 1.7273736415730488e-05,
      "loss": 0.0018,
      "num_tokens": 60248857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 951
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31733333333333336,
      "grad_norm": 3.660021548057557e-07,
      "kl": 0.04833984375,
      "learning_rate": 1.7265746709709762e-05,
      "loss": 0.0019,
      "num_tokens": 60325145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 952
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31766666666666665,
      "grad_norm": 1.8270475266035646e-07,
      "kl": 0.047119140625,
      "learning_rate": 1.7257747166921186e-05,
      "loss": 0.0019,
      "num_tokens": 60399081.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 953
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.318,
      "grad_norm": 1.8213741270756145e-07,
      "kl": 0.044921875,
      "learning_rate": 1.7249737798194982e-05,
      "loss": 0.0018,
      "num_tokens": 60479017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 954
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31833333333333336,
      "grad_norm": 1.6144012704444322e-07,
      "kl": 0.04534912109375,
      "learning_rate": 1.7241718614374678e-05,
      "loss": 0.0018,
      "num_tokens": 60554249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 955
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31866666666666665,
      "grad_norm": 2.262213882886499e-07,
      "kl": 0.0458984375,
      "learning_rate": 1.723368962631708e-05,
      "loss": 0.0018,
      "num_tokens": 60627545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 956
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.319,
      "grad_norm": 1.1377928643696578e-07,
      "kl": 0.03997802734375,
      "learning_rate": 1.722565084489228e-05,
      "loss": 0.0016,
      "num_tokens": 60701801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 957
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31933333333333336,
      "grad_norm": 2.4783636831671174e-07,
      "kl": 0.048583984375,
      "learning_rate": 1.7217602280983622e-05,
      "loss": 0.0019,
      "num_tokens": 60776873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 958
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.31966666666666665,
      "grad_norm": 3.2239969982583716e-07,
      "kl": 0.04345703125,
      "learning_rate": 1.7209543945487696e-05,
      "loss": 0.0017,
      "num_tokens": 60855401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 959
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.32,
      "grad_norm": 2.5958357241506747e-07,
      "kl": 0.0455322265625,
      "learning_rate": 1.720147584931431e-05,
      "loss": 0.0018,
      "num_tokens": 60932073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 960
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.32033333333333336,
      "grad_norm": 2.2727870430117036e-07,
      "kl": 0.04559326171875,
      "learning_rate": 1.7193398003386514e-05,
      "loss": 0.0018,
      "num_tokens": 61007337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 961
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.32066666666666666,
      "grad_norm": 1.5408605236189032e-07,
      "kl": 0.04693603515625,
      "learning_rate": 1.7185310418640525e-05,
      "loss": 0.0019,
      "num_tokens": 61081529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 962
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.321,
      "grad_norm": 2.603428583825007e-07,
      "kl": 0.04425048828125,
      "learning_rate": 1.7177213106025768e-05,
      "loss": 0.0018,
      "num_tokens": 61156937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 963
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.32133333333333336,
      "grad_norm": 2.9015737368354166e-07,
      "kl": 0.049072265625,
      "learning_rate": 1.716910607650483e-05,
      "loss": 0.002,
      "num_tokens": 61233257.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 964
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.32166666666666666,
      "grad_norm": 4.746328841065406e-07,
      "kl": 0.04620361328125,
      "learning_rate": 1.716098934105345e-05,
      "loss": 0.0018,
      "num_tokens": 61311097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 965
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.322,
      "grad_norm": 2.7876174613084004e-07,
      "kl": 0.046875,
      "learning_rate": 1.7152862910660516e-05,
      "loss": 0.0019,
      "num_tokens": 61388057.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 966
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.32233333333333336,
      "grad_norm": 3.3170377378155536e-07,
      "kl": 0.045166015625,
      "learning_rate": 1.7144726796328034e-05,
      "loss": 0.0018,
      "num_tokens": 61463945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 967
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.32266666666666666,
      "grad_norm": 2.6486006277082197e-07,
      "kl": 0.04180908203125,
      "learning_rate": 1.7136581009071126e-05,
      "loss": 0.0017,
      "num_tokens": 61545737.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 968
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.323,
      "grad_norm": 2.37801714320085e-07,
      "kl": 0.04315185546875,
      "learning_rate": 1.7128425559918006e-05,
      "loss": 0.0017,
      "num_tokens": 61621945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 969
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3233333333333333,
      "grad_norm": 1.7786433659239265e-07,
      "kl": 0.0479736328125,
      "learning_rate": 1.712026045990997e-05,
      "loss": 0.0019,
      "num_tokens": 61697177.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 970
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.32366666666666666,
      "grad_norm": 1.7683585440408933e-07,
      "kl": 0.04315185546875,
      "learning_rate": 1.711208572010137e-05,
      "loss": 0.0017,
      "num_tokens": 61770985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 971
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.324,
      "grad_norm": 2.1341801925700565e-07,
      "kl": 0.0411376953125,
      "learning_rate": 1.710390135155964e-05,
      "loss": 0.0016,
      "num_tokens": 61846889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 972
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3243333333333333,
      "grad_norm": 2.0760180063916778e-07,
      "kl": 0.0438232421875,
      "learning_rate": 1.709570736536521e-05,
      "loss": 0.0018,
      "num_tokens": 61921529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 973
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.32466666666666666,
      "grad_norm": 2.2944283273318433e-07,
      "kl": 0.04364013671875,
      "learning_rate": 1.708750377261156e-05,
      "loss": 0.0017,
      "num_tokens": 61998233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 974
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.325,
      "grad_norm": 1.726755840536498e-07,
      "kl": 0.04791259765625,
      "learning_rate": 1.7079290584405158e-05,
      "loss": 0.0019,
      "num_tokens": 62073081.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 975
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3253333333333333,
      "grad_norm": 3.4572346407912846e-07,
      "kl": 0.04595947265625,
      "learning_rate": 1.7071067811865477e-05,
      "loss": 0.0018,
      "num_tokens": 62151593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 976
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.32566666666666666,
      "grad_norm": 3.7549895637312147e-07,
      "kl": 0.0467529296875,
      "learning_rate": 1.7062835466124953e-05,
      "loss": 0.0019,
      "num_tokens": 62228265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 977
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.326,
      "grad_norm": 1.4343746101985744e-07,
      "kl": 0.0455322265625,
      "learning_rate": 1.7054593558328996e-05,
      "loss": 0.0018,
      "num_tokens": 62300985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 978
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3263333333333333,
      "grad_norm": 2.116904624926974e-07,
      "kl": 0.04571533203125,
      "learning_rate": 1.7046342099635948e-05,
      "loss": 0.0018,
      "num_tokens": 62376233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 979
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.32666666666666666,
      "grad_norm": 3.3913011066033505e-07,
      "kl": 0.04510498046875,
      "learning_rate": 1.7038081101217093e-05,
      "loss": 0.0018,
      "num_tokens": 62452809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 980
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.327,
      "grad_norm": 1.353988920982374e-07,
      "kl": 0.04315185546875,
      "learning_rate": 1.702981057425662e-05,
      "loss": 0.0017,
      "num_tokens": 62527209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 981
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3273333333333333,
      "grad_norm": 2.3559587702948193e-07,
      "kl": 0.04425048828125,
      "learning_rate": 1.7021530529951627e-05,
      "loss": 0.0018,
      "num_tokens": 62602617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 982
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.32766666666666666,
      "grad_norm": 1.7213457681464206e-07,
      "kl": 0.04522705078125,
      "learning_rate": 1.701324097951209e-05,
      "loss": 0.0018,
      "num_tokens": 62678505.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 983
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.328,
      "grad_norm": 3.7311707501430647e-07,
      "kl": 0.04547119140625,
      "learning_rate": 1.7004941934160866e-05,
      "loss": 0.0018,
      "num_tokens": 62756361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 984
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3283333333333333,
      "grad_norm": 3.233733139040851e-07,
      "kl": 0.0484619140625,
      "learning_rate": 1.6996633405133656e-05,
      "loss": 0.0019,
      "num_tokens": 62833705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 985
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.32866666666666666,
      "grad_norm": 2.3835906404201523e-07,
      "kl": 0.043212890625,
      "learning_rate": 1.6988315403679e-05,
      "loss": 0.0017,
      "num_tokens": 62912665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 986
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.329,
      "grad_norm": 2.4506357476639096e-07,
      "kl": 0.04803466796875,
      "learning_rate": 1.6979987941058274e-05,
      "loss": 0.0019,
      "num_tokens": 62987369.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 987
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3293333333333333,
      "grad_norm": 1.1797215648812198e-07,
      "kl": 0.0438232421875,
      "learning_rate": 1.697165102854565e-05,
      "loss": 0.0018,
      "num_tokens": 63061145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 988
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.32966666666666666,
      "grad_norm": 1.3758707950728422e-07,
      "kl": 0.04705810546875,
      "learning_rate": 1.6963304677428096e-05,
      "loss": 0.0019,
      "num_tokens": 63135881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 989
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.33,
      "grad_norm": 2.8284682684898144e-07,
      "kl": 0.04779052734375,
      "learning_rate": 1.6954948899005365e-05,
      "loss": 0.0019,
      "num_tokens": 63210393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 990
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3303333333333333,
      "grad_norm": 2.3840239293804188e-07,
      "kl": 0.0447998046875,
      "learning_rate": 1.6946583704589973e-05,
      "loss": 0.0018,
      "num_tokens": 63284905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 991
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.33066666666666666,
      "grad_norm": 2.231939362218327e-07,
      "kl": 0.0484619140625,
      "learning_rate": 1.6938209105507177e-05,
      "loss": 0.0019,
      "num_tokens": 63359929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 992
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.331,
      "grad_norm": 2.723217562561331e-07,
      "kl": 0.04864501953125,
      "learning_rate": 1.6929825113094972e-05,
      "loss": 0.0019,
      "num_tokens": 63433769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 993
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3313333333333333,
      "grad_norm": 1.314094362214746e-07,
      "kl": 0.04608154296875,
      "learning_rate": 1.692143173870407e-05,
      "loss": 0.0018,
      "num_tokens": 63509225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 994
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.33166666666666667,
      "grad_norm": 1.4864269815006992e-07,
      "kl": 0.048095703125,
      "learning_rate": 1.6913028993697877e-05,
      "loss": 0.0019,
      "num_tokens": 63583417.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 995
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.332,
      "grad_norm": 2.8631365012188326e-07,
      "kl": 0.039306640625,
      "learning_rate": 1.6904616889452497e-05,
      "loss": 0.0016,
      "num_tokens": 63660713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 996
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3323333333333333,
      "grad_norm": 2.4666348963364726e-07,
      "kl": 0.0465087890625,
      "learning_rate": 1.68961954373567e-05,
      "loss": 0.0019,
      "num_tokens": 63736633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 997
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.33266666666666667,
      "grad_norm": 2.489578321274166e-07,
      "kl": 0.0477294921875,
      "learning_rate": 1.688776464881191e-05,
      "loss": 0.0019,
      "num_tokens": 63813257.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 998
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.333,
      "grad_norm": 3.299307991255773e-07,
      "kl": 0.04364013671875,
      "learning_rate": 1.6879324535232186e-05,
      "loss": 0.0017,
      "num_tokens": 63889881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 999
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3333333333333333,
      "grad_norm": 2.708214310587209e-07,
      "kl": 0.04730224609375,
      "learning_rate": 1.6870875108044233e-05,
      "loss": 0.0019,
      "num_tokens": 63968505.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1000
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.33366666666666667,
      "grad_norm": 2.6537884423305513e-07,
      "kl": 0.04595947265625,
      "learning_rate": 1.686241637868734e-05,
      "loss": 0.0018,
      "num_tokens": 64045689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1001
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.334,
      "grad_norm": 2.0357389018954564e-07,
      "kl": 0.0465087890625,
      "learning_rate": 1.68539483586134e-05,
      "loss": 0.0019,
      "num_tokens": 64121945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1002
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3343333333333333,
      "grad_norm": 2.830045104929013e-07,
      "kl": 0.04766845703125,
      "learning_rate": 1.684547105928689e-05,
      "loss": 0.0019,
      "num_tokens": 64197753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1003
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.33466666666666667,
      "grad_norm": 1.793147390571903e-07,
      "kl": 0.0467529296875,
      "learning_rate": 1.683698449218484e-05,
      "loss": 0.0019,
      "num_tokens": 64274313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1004
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.335,
      "grad_norm": 2.7238968414167175e-07,
      "kl": 0.0477294921875,
      "learning_rate": 1.6828488668796836e-05,
      "loss": 0.0019,
      "num_tokens": 64347177.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1005
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3353333333333333,
      "grad_norm": 2.3318362707414053e-07,
      "kl": 0.0455322265625,
      "learning_rate": 1.6819983600624986e-05,
      "loss": 0.0018,
      "num_tokens": 64424089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1006
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.33566666666666667,
      "grad_norm": 3.2506622460459766e-07,
      "kl": 0.04632568359375,
      "learning_rate": 1.6811469299183928e-05,
      "loss": 0.0019,
      "num_tokens": 64499353.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1007
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.336,
      "grad_norm": 3.011139426689624e-07,
      "kl": 0.049072265625,
      "learning_rate": 1.6802945776000782e-05,
      "loss": 0.002,
      "num_tokens": 64579081.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1008
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3363333333333333,
      "grad_norm": 1.725312444023075e-07,
      "kl": 0.046142578125,
      "learning_rate": 1.6794413042615168e-05,
      "loss": 0.0018,
      "num_tokens": 64654201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1009
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.33666666666666667,
      "grad_norm": 2.756699473138724e-07,
      "kl": 0.04437255859375,
      "learning_rate": 1.6785871110579167e-05,
      "loss": 0.0018,
      "num_tokens": 64731449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1010
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.337,
      "grad_norm": 1.4254494828946918e-07,
      "kl": 0.04620361328125,
      "learning_rate": 1.6777319991457325e-05,
      "loss": 0.0018,
      "num_tokens": 64805033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1011
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3373333333333333,
      "grad_norm": 2.3731524834147422e-07,
      "kl": 0.04608154296875,
      "learning_rate": 1.6768759696826608e-05,
      "loss": 0.0018,
      "num_tokens": 64882665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1012
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.33766666666666667,
      "grad_norm": 2.6397302121949906e-07,
      "kl": 0.04644775390625,
      "learning_rate": 1.6760190238276418e-05,
      "loss": 0.0019,
      "num_tokens": 64959097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1013
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.338,
      "grad_norm": 2.2125695409158652e-07,
      "kl": 0.04595947265625,
      "learning_rate": 1.6751611627408567e-05,
      "loss": 0.0018,
      "num_tokens": 65035481.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1014
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3383333333333333,
      "grad_norm": 1.3229606565801078e-07,
      "kl": 0.046142578125,
      "learning_rate": 1.6743023875837233e-05,
      "loss": 0.0018,
      "num_tokens": 65111961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1015
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.33866666666666667,
      "grad_norm": 1.9578828869271092e-07,
      "kl": 0.04437255859375,
      "learning_rate": 1.6734426995189003e-05,
      "loss": 0.0018,
      "num_tokens": 65185833.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1016
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.339,
      "grad_norm": 3.02549779007677e-07,
      "kl": 0.0491943359375,
      "learning_rate": 1.6725820997102804e-05,
      "loss": 0.002,
      "num_tokens": 65264809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1017
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3393333333333333,
      "grad_norm": 1.3841760448940477e-07,
      "kl": 0.0467529296875,
      "learning_rate": 1.6717205893229904e-05,
      "loss": 0.0019,
      "num_tokens": 65339465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1018
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3396666666666667,
      "grad_norm": 2.1331956645553873e-07,
      "kl": 0.0465087890625,
      "learning_rate": 1.670858169523391e-05,
      "loss": 0.0019,
      "num_tokens": 65416201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1019
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.34,
      "grad_norm": 2.119055579896667e-07,
      "kl": 0.0433349609375,
      "learning_rate": 1.6699948414790734e-05,
      "loss": 0.0017,
      "num_tokens": 65492841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1020
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3403333333333333,
      "grad_norm": 1.1064967964102834e-07,
      "kl": 0.04364013671875,
      "learning_rate": 1.6691306063588583e-05,
      "loss": 0.0017,
      "num_tokens": 65566569.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1021
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3406666666666667,
      "grad_norm": 2.98130061082702e-07,
      "kl": 0.04742431640625,
      "learning_rate": 1.6682654653327953e-05,
      "loss": 0.0019,
      "num_tokens": 65641817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1022
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.341,
      "grad_norm": 2.5687106131044857e-07,
      "kl": 0.0467529296875,
      "learning_rate": 1.66739941957216e-05,
      "loss": 0.0019,
      "num_tokens": 65719401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1023
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3413333333333333,
      "grad_norm": 3.3902935570040427e-07,
      "kl": 0.047607421875,
      "learning_rate": 1.6665324702494524e-05,
      "loss": 0.0019,
      "num_tokens": 65795209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1024
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3416666666666667,
      "grad_norm": 1.897923596061446e-07,
      "kl": 0.0457763671875,
      "learning_rate": 1.665664618538397e-05,
      "loss": 0.0018,
      "num_tokens": 65870409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1025
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.342,
      "grad_norm": 2.997032311213843e-07,
      "kl": 0.04754638671875,
      "learning_rate": 1.6647958656139377e-05,
      "loss": 0.0019,
      "num_tokens": 65946377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1026
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3423333333333333,
      "grad_norm": 1.877434669950162e-07,
      "kl": 0.04217529296875,
      "learning_rate": 1.6639262126522417e-05,
      "loss": 0.0017,
      "num_tokens": 66020441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1027
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3426666666666667,
      "grad_norm": 2.0538445255624538e-07,
      "kl": 0.0452880859375,
      "learning_rate": 1.663055660830692e-05,
      "loss": 0.0018,
      "num_tokens": 66096441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1028
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.343,
      "grad_norm": 2.991582164213469e-07,
      "kl": 0.04833984375,
      "learning_rate": 1.6621842113278902e-05,
      "loss": 0.0019,
      "num_tokens": 66171641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1029
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3433333333333333,
      "grad_norm": 1.7526781448395923e-07,
      "kl": 0.04840087890625,
      "learning_rate": 1.661311865323652e-05,
      "loss": 0.0019,
      "num_tokens": 66245401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1030
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3436666666666667,
      "grad_norm": 2.1655398541042814e-07,
      "kl": 0.043701171875,
      "learning_rate": 1.6604386239990077e-05,
      "loss": 0.0017,
      "num_tokens": 66319401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1031
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.344,
      "grad_norm": 3.229246203773073e-07,
      "kl": 0.0458984375,
      "learning_rate": 1.6595644885362e-05,
      "loss": 0.0018,
      "num_tokens": 66396361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1032
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3443333333333333,
      "grad_norm": 3.012336833307927e-07,
      "kl": 0.0472412109375,
      "learning_rate": 1.6586894601186804e-05,
      "loss": 0.0019,
      "num_tokens": 66471529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1033
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3446666666666667,
      "grad_norm": 2.6796817564900266e-07,
      "kl": 0.0479736328125,
      "learning_rate": 1.657813539931112e-05,
      "loss": 0.0019,
      "num_tokens": 66545801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1034
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.345,
      "grad_norm": 2.4642193352519826e-07,
      "kl": 0.0455322265625,
      "learning_rate": 1.6569367291593627e-05,
      "loss": 0.0018,
      "num_tokens": 66622553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1035
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3453333333333333,
      "grad_norm": 1.6899959121019492e-07,
      "kl": 0.0440673828125,
      "learning_rate": 1.6560590289905074e-05,
      "loss": 0.0018,
      "num_tokens": 66697433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1036
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3456666666666667,
      "grad_norm": 1.9668938477934717e-07,
      "kl": 0.05108642578125,
      "learning_rate": 1.655180440612825e-05,
      "loss": 0.002,
      "num_tokens": 66772169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1037
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.346,
      "grad_norm": 1.58758268753445e-07,
      "kl": 0.0435791015625,
      "learning_rate": 1.6543009652157973e-05,
      "loss": 0.0017,
      "num_tokens": 66848745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1038
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3463333333333333,
      "grad_norm": 1.5893020588464424e-07,
      "kl": 0.046875,
      "learning_rate": 1.6534206039901057e-05,
      "loss": 0.0019,
      "num_tokens": 66923465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1039
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3466666666666667,
      "grad_norm": 2.36673670883647e-07,
      "kl": 0.0457763671875,
      "learning_rate": 1.652539358127632e-05,
      "loss": 0.0018,
      "num_tokens": 66997833.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1040
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.347,
      "grad_norm": 2.716946028158418e-07,
      "kl": 0.046630859375,
      "learning_rate": 1.6516572288214555e-05,
      "loss": 0.0019,
      "num_tokens": 67073945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1041
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3473333333333333,
      "grad_norm": 2.588246559298568e-07,
      "kl": 0.04364013671875,
      "learning_rate": 1.650774217265851e-05,
      "loss": 0.0017,
      "num_tokens": 67151097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1042
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3476666666666667,
      "grad_norm": 2.486681296431925e-07,
      "kl": 0.0411376953125,
      "learning_rate": 1.649890324656289e-05,
      "loss": 0.0016,
      "num_tokens": 67227161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1043
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.348,
      "grad_norm": 3.447154313107603e-07,
      "kl": 0.0457763671875,
      "learning_rate": 1.649005552189431e-05,
      "loss": 0.0018,
      "num_tokens": 67304825.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1044
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.34833333333333333,
      "grad_norm": 1.6775233291355107e-07,
      "kl": 0.044921875,
      "learning_rate": 1.6481199010631312e-05,
      "loss": 0.0018,
      "num_tokens": 67382937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1045
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3486666666666667,
      "grad_norm": 1.9476284762731666e-07,
      "kl": 0.04541015625,
      "learning_rate": 1.6472333724764326e-05,
      "loss": 0.0018,
      "num_tokens": 67457593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1046
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.349,
      "grad_norm": 3.570832802779478e-07,
      "kl": 0.045166015625,
      "learning_rate": 1.6463459676295666e-05,
      "loss": 0.0018,
      "num_tokens": 67534121.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1047
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.34933333333333333,
      "grad_norm": 1.379593896899678e-07,
      "kl": 0.0467529296875,
      "learning_rate": 1.645457687723951e-05,
      "loss": 0.0019,
      "num_tokens": 67609129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1048
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3496666666666667,
      "grad_norm": 3.7264581465024094e-07,
      "kl": 0.046142578125,
      "learning_rate": 1.644568533962187e-05,
      "loss": 0.0018,
      "num_tokens": 67684617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1049
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.35,
      "grad_norm": 1.9051678634696145e-07,
      "kl": 0.04595947265625,
      "learning_rate": 1.643678507548061e-05,
      "loss": 0.0018,
      "num_tokens": 67759161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1050
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.35033333333333333,
      "grad_norm": 2.8085653980269853e-07,
      "kl": 0.046630859375,
      "learning_rate": 1.6427876096865394e-05,
      "loss": 0.0019,
      "num_tokens": 67835753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1051
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3506666666666667,
      "grad_norm": 2.3154112227530277e-07,
      "kl": 0.04364013671875,
      "learning_rate": 1.6418958415837688e-05,
      "loss": 0.0017,
      "num_tokens": 67909977.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1052
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.351,
      "grad_norm": 2.1219780421688483e-07,
      "kl": 0.047607421875,
      "learning_rate": 1.6410032044470735e-05,
      "loss": 0.0019,
      "num_tokens": 67983993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1053
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.35133333333333333,
      "grad_norm": 3.1413446777150966e-07,
      "kl": 0.04327392578125,
      "learning_rate": 1.6401096994849558e-05,
      "loss": 0.0017,
      "num_tokens": 68060505.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1054
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3516666666666667,
      "grad_norm": 3.213079935449059e-07,
      "kl": 0.0457763671875,
      "learning_rate": 1.6392153279070905e-05,
      "loss": 0.0018,
      "num_tokens": 68137689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1055
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.352,
      "grad_norm": 1.5866876879044867e-07,
      "kl": 0.0489501953125,
      "learning_rate": 1.6383200909243285e-05,
      "loss": 0.002,
      "num_tokens": 68210649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1056
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.35233333333333333,
      "grad_norm": 1.5397878883049998e-07,
      "kl": 0.04254150390625,
      "learning_rate": 1.63742398974869e-05,
      "loss": 0.0017,
      "num_tokens": 68284217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1057
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3526666666666667,
      "grad_norm": 1.849784183605152e-07,
      "kl": 0.04437255859375,
      "learning_rate": 1.6365270255933663e-05,
      "loss": 0.0018,
      "num_tokens": 68359449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1058
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.353,
      "grad_norm": 3.638995735855133e-07,
      "kl": 0.0435791015625,
      "learning_rate": 1.635629199672717e-05,
      "loss": 0.0017,
      "num_tokens": 68436297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1059
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.35333333333333333,
      "grad_norm": 1.8141595603538008e-07,
      "kl": 0.04052734375,
      "learning_rate": 1.6347305132022677e-05,
      "loss": 0.0016,
      "num_tokens": 68511897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1060
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3536666666666667,
      "grad_norm": 1.6439460637229786e-07,
      "kl": 0.04345703125,
      "learning_rate": 1.63383096739871e-05,
      "loss": 0.0017,
      "num_tokens": 68587721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1061
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.354,
      "grad_norm": 2.125379552353479e-07,
      "kl": 0.041259765625,
      "learning_rate": 1.6329305634798993e-05,
      "loss": 0.0016,
      "num_tokens": 68661401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1062
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.35433333333333333,
      "grad_norm": 2.076277070273136e-07,
      "kl": 0.0462646484375,
      "learning_rate": 1.632029302664851e-05,
      "loss": 0.0019,
      "num_tokens": 68737641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1063
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3546666666666667,
      "grad_norm": 1.4812604831604403e-07,
      "kl": 0.0447998046875,
      "learning_rate": 1.6311271861737417e-05,
      "loss": 0.0018,
      "num_tokens": 68811705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1064
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.355,
      "grad_norm": 1.0499272207198374e-07,
      "kl": 0.04449462890625,
      "learning_rate": 1.6302242152279068e-05,
      "loss": 0.0018,
      "num_tokens": 68884681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1065
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.35533333333333333,
      "grad_norm": 1.536605083174436e-07,
      "kl": 0.0458984375,
      "learning_rate": 1.6293203910498375e-05,
      "loss": 0.0018,
      "num_tokens": 68957993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1066
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3556666666666667,
      "grad_norm": 0.0003515103890094906,
      "kl": 0.0419921875,
      "learning_rate": 1.6284157148631814e-05,
      "loss": 0.0017,
      "num_tokens": 69035865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1067
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.356,
      "grad_norm": 2.0275901135846652e-07,
      "kl": 0.04437255859375,
      "learning_rate": 1.6275101878927382e-05,
      "loss": 0.0018,
      "num_tokens": 69110729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1068
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.35633333333333334,
      "grad_norm": 3.8990665984783845e-07,
      "kl": 0.0482177734375,
      "learning_rate": 1.6266038113644605e-05,
      "loss": 0.0019,
      "num_tokens": 69184857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1069
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3566666666666667,
      "grad_norm": 9.444265458569134e-08,
      "kl": 0.0411376953125,
      "learning_rate": 1.625696586505451e-05,
      "loss": 0.0016,
      "num_tokens": 69259337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1070
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.357,
      "grad_norm": 2.391022917436203e-07,
      "kl": 0.04437255859375,
      "learning_rate": 1.6247885145439602e-05,
      "loss": 0.0018,
      "num_tokens": 69333945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1071
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.35733333333333334,
      "grad_norm": 2.569869081980869e-07,
      "kl": 0.0452880859375,
      "learning_rate": 1.6238795967093865e-05,
      "loss": 0.0018,
      "num_tokens": 69410825.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1072
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3576666666666667,
      "grad_norm": 2.1654653892255737e-07,
      "kl": 0.04815673828125,
      "learning_rate": 1.622969834232272e-05,
      "loss": 0.0019,
      "num_tokens": 69484105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1073
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.358,
      "grad_norm": 2.987639504681283e-07,
      "kl": 0.0433349609375,
      "learning_rate": 1.622059228344304e-05,
      "loss": 0.0017,
      "num_tokens": 69561193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1074
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.35833333333333334,
      "grad_norm": 1.5380136630938068e-07,
      "kl": 0.04339599609375,
      "learning_rate": 1.6211477802783105e-05,
      "loss": 0.0017,
      "num_tokens": 69635833.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1075
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3586666666666667,
      "grad_norm": 2.7958495252278226e-07,
      "kl": 0.044677734375,
      "learning_rate": 1.6202354912682602e-05,
      "loss": 0.0018,
      "num_tokens": 69712041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1076
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.359,
      "grad_norm": 3.8320197859320615e-07,
      "kl": 0.0447998046875,
      "learning_rate": 1.6193223625492604e-05,
      "loss": 0.0018,
      "num_tokens": 69788569.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1077
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.35933333333333334,
      "grad_norm": 3.4159901929342595e-07,
      "kl": 0.045654296875,
      "learning_rate": 1.6184083953575543e-05,
      "loss": 0.0018,
      "num_tokens": 69870809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1078
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3596666666666667,
      "grad_norm": 2.1942899763871537e-07,
      "kl": 0.052734375,
      "learning_rate": 1.6174935909305216e-05,
      "loss": 0.0021,
      "num_tokens": 69947705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1079
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.36,
      "grad_norm": 1.8553926395270537e-07,
      "kl": 0.042236328125,
      "learning_rate": 1.616577950506675e-05,
      "loss": 0.0017,
      "num_tokens": 70022473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1080
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.36033333333333334,
      "grad_norm": 3.3929265441656753e-07,
      "kl": 0.0478515625,
      "learning_rate": 1.6156614753256583e-05,
      "loss": 0.0019,
      "num_tokens": 70099321.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1081
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3606666666666667,
      "grad_norm": 3.1992675531000714e-07,
      "kl": 0.0438232421875,
      "learning_rate": 1.614744166628247e-05,
      "loss": 0.0018,
      "num_tokens": 70175673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1082
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.361,
      "grad_norm": 2.317240017646327e-07,
      "kl": 0.04638671875,
      "learning_rate": 1.613826025656343e-05,
      "loss": 0.0019,
      "num_tokens": 70253801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1083
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.36133333333333334,
      "grad_norm": 1.5721208512786689e-07,
      "kl": 0.0457763671875,
      "learning_rate": 1.6129070536529767e-05,
      "loss": 0.0018,
      "num_tokens": 70328233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1084
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3616666666666667,
      "grad_norm": 1.594924441405965e-07,
      "kl": 0.0447998046875,
      "learning_rate": 1.611987251862303e-05,
      "loss": 0.0018,
      "num_tokens": 70403433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1085
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.362,
      "grad_norm": 2.429415246751887e-07,
      "kl": 0.04754638671875,
      "learning_rate": 1.6110666215296e-05,
      "loss": 0.0019,
      "num_tokens": 70480297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1086
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.36233333333333334,
      "grad_norm": 2.861544032839447e-07,
      "kl": 0.04266357421875,
      "learning_rate": 1.610145163901268e-05,
      "loss": 0.0017,
      "num_tokens": 70556105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1087
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3626666666666667,
      "grad_norm": 1.616263176629218e-07,
      "kl": 0.0445556640625,
      "learning_rate": 1.6092228802248264e-05,
      "loss": 0.0018,
      "num_tokens": 70629801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1088
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.363,
      "grad_norm": 2.057961694390542e-07,
      "kl": 0.0465087890625,
      "learning_rate": 1.6082997717489145e-05,
      "loss": 0.0019,
      "num_tokens": 70707177.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1089
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.36333333333333334,
      "grad_norm": 2.0469707351367106e-07,
      "kl": 0.046875,
      "learning_rate": 1.607375839723287e-05,
      "loss": 0.0019,
      "num_tokens": 70781705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1090
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3636666666666667,
      "grad_norm": 2.581525961886655e-07,
      "kl": 0.04473876953125,
      "learning_rate": 1.6064510853988137e-05,
      "loss": 0.0018,
      "num_tokens": 70858025.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1091
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.364,
      "grad_norm": 1.979224180104211e-07,
      "kl": 0.044921875,
      "learning_rate": 1.605525510027478e-05,
      "loss": 0.0018,
      "num_tokens": 70934425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1092
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.36433333333333334,
      "grad_norm": 1.7411876740425214e-07,
      "kl": 0.04656982421875,
      "learning_rate": 1.6045991148623752e-05,
      "loss": 0.0019,
      "num_tokens": 71010345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1093
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.36466666666666664,
      "grad_norm": 1.1158581258996492e-07,
      "kl": 0.04327392578125,
      "learning_rate": 1.6036719011577094e-05,
      "loss": 0.0017,
      "num_tokens": 71083833.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1094
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.365,
      "grad_norm": 2.1587241860743234e-07,
      "kl": 0.0477294921875,
      "learning_rate": 1.6027438701687937e-05,
      "loss": 0.0019,
      "num_tokens": 71162681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1095
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.36533333333333334,
      "grad_norm": 2.580104307980946e-07,
      "kl": 0.047119140625,
      "learning_rate": 1.6018150231520486e-05,
      "loss": 0.0019,
      "num_tokens": 71239001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1096
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.36566666666666664,
      "grad_norm": 2.466824184921279e-07,
      "kl": 0.0445556640625,
      "learning_rate": 1.600885361364997e-05,
      "loss": 0.0018,
      "num_tokens": 71313497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1097
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.366,
      "grad_norm": 1.476787048204642e-07,
      "kl": 0.04803466796875,
      "learning_rate": 1.5999548860662666e-05,
      "loss": 0.0019,
      "num_tokens": 71387737.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1098
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.36633333333333334,
      "grad_norm": 2.01558890466913e-07,
      "kl": 0.04541015625,
      "learning_rate": 1.599023598515586e-05,
      "loss": 0.0018,
      "num_tokens": 71462217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1099
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.36666666666666664,
      "grad_norm": 1.8711165239437832e-07,
      "kl": 0.044921875,
      "learning_rate": 1.598091499973784e-05,
      "loss": 0.0018,
      "num_tokens": 71536489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1100
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.367,
      "grad_norm": 1.7722781819884403e-07,
      "kl": 0.0440673828125,
      "learning_rate": 1.5971585917027864e-05,
      "loss": 0.0018,
      "num_tokens": 71611337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1101
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.36733333333333335,
      "grad_norm": 1.782724154963944e-07,
      "kl": 0.046142578125,
      "learning_rate": 1.5962248749656158e-05,
      "loss": 0.0018,
      "num_tokens": 71687113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1102
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.36766666666666664,
      "grad_norm": 1.560152611546073e-07,
      "kl": 0.04559326171875,
      "learning_rate": 1.59529035102639e-05,
      "loss": 0.0018,
      "num_tokens": 71762393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1103
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.368,
      "grad_norm": 3.2420365414509433e-07,
      "kl": 0.04632568359375,
      "learning_rate": 1.594355021150318e-05,
      "loss": 0.0019,
      "num_tokens": 71837897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1104
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.36833333333333335,
      "grad_norm": 2.2718381842423696e-07,
      "kl": 0.0438232421875,
      "learning_rate": 1.5934188866037017e-05,
      "loss": 0.0018,
      "num_tokens": 71913209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1105
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.36866666666666664,
      "grad_norm": 2.565647605479171e-07,
      "kl": 0.0443115234375,
      "learning_rate": 1.592481948653931e-05,
      "loss": 0.0018,
      "num_tokens": 71988409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1106
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.369,
      "grad_norm": 3.3098220342253626e-07,
      "kl": 0.046142578125,
      "learning_rate": 1.591544208569484e-05,
      "loss": 0.0018,
      "num_tokens": 72069433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1107
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.36933333333333335,
      "grad_norm": 1.6562962912303192e-07,
      "kl": 0.045654296875,
      "learning_rate": 1.5906056676199256e-05,
      "loss": 0.0018,
      "num_tokens": 72147337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1108
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.36966666666666664,
      "grad_norm": 1.5054055779728515e-07,
      "kl": 0.0433349609375,
      "learning_rate": 1.5896663270759034e-05,
      "loss": 0.0017,
      "num_tokens": 72221961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1109
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37,
      "grad_norm": 2.499229196928354e-07,
      "kl": 0.043212890625,
      "learning_rate": 1.5887261882091488e-05,
      "loss": 0.0017,
      "num_tokens": 72297225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1110
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37033333333333335,
      "grad_norm": 1.498172537139908e-07,
      "kl": 0.04901123046875,
      "learning_rate": 1.5877852522924733e-05,
      "loss": 0.002,
      "num_tokens": 72371897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1111
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37066666666666664,
      "grad_norm": 2.1787293746911018e-07,
      "kl": 0.04730224609375,
      "learning_rate": 1.586843520599768e-05,
      "loss": 0.0019,
      "num_tokens": 72447865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1112
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.371,
      "grad_norm": 1.8467947882072622e-07,
      "kl": 0.04278564453125,
      "learning_rate": 1.5859009944060005e-05,
      "loss": 0.0017,
      "num_tokens": 72524617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1113
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37133333333333335,
      "grad_norm": 1.8448743333010498e-07,
      "kl": 0.0465087890625,
      "learning_rate": 1.584957674987216e-05,
      "loss": 0.0019,
      "num_tokens": 72600009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1114
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37166666666666665,
      "grad_norm": 1.733871641818041e-07,
      "kl": 0.04290771484375,
      "learning_rate": 1.5840135636205305e-05,
      "loss": 0.0017,
      "num_tokens": 72674601.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1115
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.372,
      "grad_norm": 3.3537230592628475e-07,
      "kl": 0.051025390625,
      "learning_rate": 1.5830686615841348e-05,
      "loss": 0.002,
      "num_tokens": 72752153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1116
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37233333333333335,
      "grad_norm": 2.052071295111091e-07,
      "kl": 0.0457763671875,
      "learning_rate": 1.5821229701572897e-05,
      "loss": 0.0018,
      "num_tokens": 72826473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1117
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37266666666666665,
      "grad_norm": 4.1592105048948724e-07,
      "kl": 0.046875,
      "learning_rate": 1.5811764906203235e-05,
      "loss": 0.0019,
      "num_tokens": 72903049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1118
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.373,
      "grad_norm": 2.8275951535761124e-07,
      "kl": 0.05181884765625,
      "learning_rate": 1.580229224254633e-05,
      "loss": 0.0021,
      "num_tokens": 72980441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1119
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37333333333333335,
      "grad_norm": 2.4958129074548197e-07,
      "kl": 0.0465087890625,
      "learning_rate": 1.5792811723426787e-05,
      "loss": 0.0019,
      "num_tokens": 73055465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1120
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37366666666666665,
      "grad_norm": 2.344265084275321e-07,
      "kl": 0.04681396484375,
      "learning_rate": 1.5783323361679865e-05,
      "loss": 0.0019,
      "num_tokens": 73131529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1121
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.374,
      "grad_norm": 1.1609753158836611e-07,
      "kl": 0.04510498046875,
      "learning_rate": 1.5773827170151425e-05,
      "loss": 0.0018,
      "num_tokens": 73209001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1122
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37433333333333335,
      "grad_norm": 1.638546081039749e-07,
      "kl": 0.04376220703125,
      "learning_rate": 1.5764323161697933e-05,
      "loss": 0.0018,
      "num_tokens": 73283513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1123
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37466666666666665,
      "grad_norm": 1.9412206597735349e-07,
      "kl": 0.04345703125,
      "learning_rate": 1.5754811349186443e-05,
      "loss": 0.0017,
      "num_tokens": 73359273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1124
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.375,
      "grad_norm": 1.7888619652239868e-07,
      "kl": 0.0482177734375,
      "learning_rate": 1.5745291745494563e-05,
      "loss": 0.0019,
      "num_tokens": 73434777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1125
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37533333333333335,
      "grad_norm": 2.2083835915509553e-07,
      "kl": 0.04522705078125,
      "learning_rate": 1.573576436351046e-05,
      "loss": 0.0018,
      "num_tokens": 73509593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1126
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37566666666666665,
      "grad_norm": 2.259782405644728e-07,
      "kl": 0.04559326171875,
      "learning_rate": 1.5726229216132835e-05,
      "loss": 0.0018,
      "num_tokens": 73587641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1127
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.376,
      "grad_norm": 1.266240872155322e-07,
      "kl": 0.050048828125,
      "learning_rate": 1.5716686316270884e-05,
      "loss": 0.002,
      "num_tokens": 73661913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1128
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37633333333333335,
      "grad_norm": 1.7839907684447098e-07,
      "kl": 0.04559326171875,
      "learning_rate": 1.570713567684432e-05,
      "loss": 0.0018,
      "num_tokens": 73737017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1129
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37666666666666665,
      "grad_norm": 2.0319306770488765e-07,
      "kl": 0.04742431640625,
      "learning_rate": 1.5697577310783318e-05,
      "loss": 0.0019,
      "num_tokens": 73811897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1130
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.377,
      "grad_norm": 2.0491488328389096e-07,
      "kl": 0.047607421875,
      "learning_rate": 1.568801123102852e-05,
      "loss": 0.0019,
      "num_tokens": 73886073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1131
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37733333333333335,
      "grad_norm": 2.960838685339695e-07,
      "kl": 0.04779052734375,
      "learning_rate": 1.5678437450531014e-05,
      "loss": 0.0019,
      "num_tokens": 73963897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1132
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37766666666666665,
      "grad_norm": 2.704465487113339e-07,
      "kl": 0.04620361328125,
      "learning_rate": 1.566885598225231e-05,
      "loss": 0.0018,
      "num_tokens": 74041833.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1133
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.378,
      "grad_norm": 2.1534745542339806e-07,
      "kl": 0.044921875,
      "learning_rate": 1.565926683916433e-05,
      "loss": 0.0018,
      "num_tokens": 74117753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1134
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37833333333333335,
      "grad_norm": 2.9199301820881374e-07,
      "kl": 0.045166015625,
      "learning_rate": 1.564967003424938e-05,
      "loss": 0.0018,
      "num_tokens": 74198281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1135
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37866666666666665,
      "grad_norm": 1.2356058221030253e-07,
      "kl": 0.043701171875,
      "learning_rate": 1.5640065580500146e-05,
      "loss": 0.0017,
      "num_tokens": 74273209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1136
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.379,
      "grad_norm": 1.8146216973491391e-07,
      "kl": 0.04718017578125,
      "learning_rate": 1.5630453490919663e-05,
      "loss": 0.0019,
      "num_tokens": 74349257.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1137
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37933333333333336,
      "grad_norm": 2.1285757156874752e-07,
      "kl": 0.04803466796875,
      "learning_rate": 1.5620833778521306e-05,
      "loss": 0.0019,
      "num_tokens": 74426537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1138
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.37966666666666665,
      "grad_norm": 2.053234311460983e-07,
      "kl": 0.0443115234375,
      "learning_rate": 1.561120645632878e-05,
      "loss": 0.0018,
      "num_tokens": 74502537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1139
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.38,
      "grad_norm": 2.053631504850273e-07,
      "kl": 0.04290771484375,
      "learning_rate": 1.560157153737607e-05,
      "loss": 0.0017,
      "num_tokens": 74575433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1140
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.38033333333333336,
      "grad_norm": 1.65422747500088e-07,
      "kl": 0.0465087890625,
      "learning_rate": 1.5591929034707468e-05,
      "loss": 0.0019,
      "num_tokens": 74650553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1141
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.38066666666666665,
      "grad_norm": 1.960008546575409e-07,
      "kl": 0.04534912109375,
      "learning_rate": 1.5582278961377524e-05,
      "loss": 0.0018,
      "num_tokens": 74725641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1142
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.381,
      "grad_norm": 2.108239982590021e-07,
      "kl": 0.0445556640625,
      "learning_rate": 1.5572621330451044e-05,
      "loss": 0.0018,
      "num_tokens": 74802345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1143
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.38133333333333336,
      "grad_norm": 2.5135875603155e-07,
      "kl": 0.04498291015625,
      "learning_rate": 1.556295615500305e-05,
      "loss": 0.0018,
      "num_tokens": 74878633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1144
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.38166666666666665,
      "grad_norm": 1.6932575874761824e-07,
      "kl": 0.045166015625,
      "learning_rate": 1.5553283448118795e-05,
      "loss": 0.0018,
      "num_tokens": 74954441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1145
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.382,
      "grad_norm": 2.2136832455998956e-07,
      "kl": 0.04522705078125,
      "learning_rate": 1.5543603222893718e-05,
      "loss": 0.0018,
      "num_tokens": 75031225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1146
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.38233333333333336,
      "grad_norm": 9.68626707731346e-08,
      "kl": 0.04071044921875,
      "learning_rate": 1.553391549243344e-05,
      "loss": 0.0016,
      "num_tokens": 75111865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1147
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.38266666666666665,
      "grad_norm": 3.990126060671173e-07,
      "kl": 0.046875,
      "learning_rate": 1.5524220269853754e-05,
      "loss": 0.0019,
      "num_tokens": 75189449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1148
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.383,
      "grad_norm": 1.8346602814744983e-07,
      "kl": 0.04193115234375,
      "learning_rate": 1.5514517568280573e-05,
      "loss": 0.0017,
      "num_tokens": 75266153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1149
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.38333333333333336,
      "grad_norm": 2.1014508888583805e-07,
      "kl": 0.0460205078125,
      "learning_rate": 1.5504807400849957e-05,
      "loss": 0.0018,
      "num_tokens": 75340585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1150
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.38366666666666666,
      "grad_norm": 1.6201340713450918e-07,
      "kl": 0.0462646484375,
      "learning_rate": 1.5495089780708062e-05,
      "loss": 0.0019,
      "num_tokens": 75414937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1151
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.384,
      "grad_norm": 1.5455056256996613e-07,
      "kl": 0.0445556640625,
      "learning_rate": 1.548536472101114e-05,
      "loss": 0.0018,
      "num_tokens": 75489113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1152
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.38433333333333336,
      "grad_norm": 2.0146632095929817e-07,
      "kl": 0.0430908203125,
      "learning_rate": 1.5475632234925505e-05,
      "loss": 0.0017,
      "num_tokens": 75564265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1153
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.38466666666666666,
      "grad_norm": 2.7232809429733607e-07,
      "kl": 0.049072265625,
      "learning_rate": 1.5465892335627537e-05,
      "loss": 0.002,
      "num_tokens": 75640489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1154
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.385,
      "grad_norm": 1.070246256063001e-07,
      "kl": 0.04351806640625,
      "learning_rate": 1.545614503630365e-05,
      "loss": 0.0017,
      "num_tokens": 75714009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1155
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.38533333333333336,
      "grad_norm": 1.6892342102892144e-07,
      "kl": 0.04608154296875,
      "learning_rate": 1.5446390350150272e-05,
      "loss": 0.0018,
      "num_tokens": 75789113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1156
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.38566666666666666,
      "grad_norm": 1.3307726476341486e-07,
      "kl": 0.04327392578125,
      "learning_rate": 1.5436628290373835e-05,
      "loss": 0.0017,
      "num_tokens": 75867945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1157
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.386,
      "grad_norm": 2.0743829054481466e-07,
      "kl": 0.04345703125,
      "learning_rate": 1.542685887019075e-05,
      "loss": 0.0017,
      "num_tokens": 75943657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1158
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3863333333333333,
      "grad_norm": 2.1642328817961243e-07,
      "kl": 0.047119140625,
      "learning_rate": 1.54170821028274e-05,
      "loss": 0.0019,
      "num_tokens": 76020585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1159
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.38666666666666666,
      "grad_norm": 1.1275442091118748e-07,
      "kl": 0.04425048828125,
      "learning_rate": 1.5407298001520108e-05,
      "loss": 0.0018,
      "num_tokens": 76097641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1160
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.387,
      "grad_norm": 1.664246696009286e-07,
      "kl": 0.04681396484375,
      "learning_rate": 1.539750657951513e-05,
      "loss": 0.0019,
      "num_tokens": 76182489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1161
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3873333333333333,
      "grad_norm": 1.529209043837909e-07,
      "kl": 0.04425048828125,
      "learning_rate": 1.5387707850068633e-05,
      "loss": 0.0018,
      "num_tokens": 76260089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1162
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.38766666666666666,
      "grad_norm": 1.7008896691095288e-07,
      "kl": 0.04443359375,
      "learning_rate": 1.5377901826446672e-05,
      "loss": 0.0018,
      "num_tokens": 76333945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1163
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.388,
      "grad_norm": 1.8821098990429164e-07,
      "kl": 0.04425048828125,
      "learning_rate": 1.5368088521925185e-05,
      "loss": 0.0018,
      "num_tokens": 76410937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1164
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3883333333333333,
      "grad_norm": 2.3498549239775457e-07,
      "kl": 0.04876708984375,
      "learning_rate": 1.5358267949789968e-05,
      "loss": 0.002,
      "num_tokens": 76489929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1165
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.38866666666666666,
      "grad_norm": 1.8069387408559123e-07,
      "kl": 0.04754638671875,
      "learning_rate": 1.5348440123336647e-05,
      "loss": 0.0019,
      "num_tokens": 76566425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1166
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.389,
      "grad_norm": 1.5659249186228408e-07,
      "kl": 0.046875,
      "learning_rate": 1.533860505587067e-05,
      "loss": 0.0019,
      "num_tokens": 76644281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1167
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3893333333333333,
      "grad_norm": 1.54360108695073e-07,
      "kl": 0.04571533203125,
      "learning_rate": 1.53287627607073e-05,
      "loss": 0.0018,
      "num_tokens": 76719577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1168
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.38966666666666666,
      "grad_norm": 1.8486534258954634e-07,
      "kl": 0.04998779296875,
      "learning_rate": 1.531891325117158e-05,
      "loss": 0.002,
      "num_tokens": 76795849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1169
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.39,
      "grad_norm": 4.238993369654054e-07,
      "kl": 0.04833984375,
      "learning_rate": 1.530905654059831e-05,
      "loss": 0.0019,
      "num_tokens": 76873849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1170
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3903333333333333,
      "grad_norm": 2.80409778952162e-07,
      "kl": 0.047607421875,
      "learning_rate": 1.529919264233205e-05,
      "loss": 0.0019,
      "num_tokens": 76949305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1171
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.39066666666666666,
      "grad_norm": 2.9063102147119935e-07,
      "kl": 0.04736328125,
      "learning_rate": 1.5289321569727093e-05,
      "loss": 0.0019,
      "num_tokens": 77025401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1172
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.391,
      "grad_norm": 4.264112476448645e-07,
      "kl": 0.046875,
      "learning_rate": 1.5279443336147437e-05,
      "loss": 0.0019,
      "num_tokens": 77103433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1173
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3913333333333333,
      "grad_norm": 1.6834538030252588e-07,
      "kl": 0.04571533203125,
      "learning_rate": 1.5269557954966777e-05,
      "loss": 0.0018,
      "num_tokens": 77178249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1174
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.39166666666666666,
      "grad_norm": 3.0631557024207723e-07,
      "kl": 0.04766845703125,
      "learning_rate": 1.525966543956849e-05,
      "loss": 0.0019,
      "num_tokens": 77254729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1175
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.392,
      "grad_norm": 2.1066006183900754e-07,
      "kl": 0.04644775390625,
      "learning_rate": 1.5249765803345602e-05,
      "loss": 0.0019,
      "num_tokens": 77331001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1176
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3923333333333333,
      "grad_norm": 1.3322177494501375e-07,
      "kl": 0.046142578125,
      "learning_rate": 1.5239859059700794e-05,
      "loss": 0.0018,
      "num_tokens": 77404425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1177
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.39266666666666666,
      "grad_norm": 2.616755807594018e-07,
      "kl": 0.04180908203125,
      "learning_rate": 1.5229945222046354e-05,
      "loss": 0.0017,
      "num_tokens": 77481529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1178
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.393,
      "grad_norm": 2.779126191398973e-07,
      "kl": 0.044677734375,
      "learning_rate": 1.5220024303804181e-05,
      "loss": 0.0018,
      "num_tokens": 77559561.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1179
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3933333333333333,
      "grad_norm": 2.0741408945923467e-07,
      "kl": 0.04693603515625,
      "learning_rate": 1.5210096318405768e-05,
      "loss": 0.0019,
      "num_tokens": 77635225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1180
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.39366666666666666,
      "grad_norm": 3.0884777402206964e-07,
      "kl": 0.04534912109375,
      "learning_rate": 1.5200161279292154e-05,
      "loss": 0.0018,
      "num_tokens": 77711577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1181
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.394,
      "grad_norm": 0.00032706017373129725,
      "kl": 0.046142578125,
      "learning_rate": 1.5190219199913956e-05,
      "loss": 0.0018,
      "num_tokens": 77790569.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1182
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3943333333333333,
      "grad_norm": 2.1506916425551026e-07,
      "kl": 0.04388427734375,
      "learning_rate": 1.5180270093731305e-05,
      "loss": 0.0018,
      "num_tokens": 77866937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1183
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.39466666666666667,
      "grad_norm": 2.786361790185765e-07,
      "kl": 0.04412841796875,
      "learning_rate": 1.5170313974213841e-05,
      "loss": 0.0018,
      "num_tokens": 77943721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1184
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.395,
      "grad_norm": 2.4016597421905317e-07,
      "kl": 0.04547119140625,
      "learning_rate": 1.5160350854840715e-05,
      "loss": 0.0018,
      "num_tokens": 78019817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1185
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3953333333333333,
      "grad_norm": 1.274765253356236e-07,
      "kl": 0.04339599609375,
      "learning_rate": 1.5150380749100545e-05,
      "loss": 0.0017,
      "num_tokens": 78095721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1186
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.39566666666666667,
      "grad_norm": 3.0003371875864104e-07,
      "kl": 0.0465087890625,
      "learning_rate": 1.5140403670491406e-05,
      "loss": 0.0019,
      "num_tokens": 78171657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1187
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.396,
      "grad_norm": 2.5176740336974035e-07,
      "kl": 0.04742431640625,
      "learning_rate": 1.5130419632520814e-05,
      "loss": 0.0019,
      "num_tokens": 78247065.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1188
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3963333333333333,
      "grad_norm": 2.1937216843070928e-07,
      "kl": 0.04278564453125,
      "learning_rate": 1.5120428648705716e-05,
      "loss": 0.0017,
      "num_tokens": 78322809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1189
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.39666666666666667,
      "grad_norm": 2.1508839154193993e-07,
      "kl": 0.0479736328125,
      "learning_rate": 1.5110430732572454e-05,
      "loss": 0.0019,
      "num_tokens": 78397609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1190
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.397,
      "grad_norm": 8.586322763903809e-08,
      "kl": 0.04803466796875,
      "learning_rate": 1.5100425897656754e-05,
      "loss": 0.0019,
      "num_tokens": 78471801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1191
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3973333333333333,
      "grad_norm": 1.683656876139139e-07,
      "kl": 0.04522705078125,
      "learning_rate": 1.5090414157503715e-05,
      "loss": 0.0018,
      "num_tokens": 78547721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1192
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.39766666666666667,
      "grad_norm": 1.384997005970945e-07,
      "kl": 0.04327392578125,
      "learning_rate": 1.508039552566778e-05,
      "loss": 0.0017,
      "num_tokens": 78622745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1193
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.398,
      "grad_norm": 2.0212321771850839e-07,
      "kl": 0.04644775390625,
      "learning_rate": 1.5070370015712727e-05,
      "loss": 0.0019,
      "num_tokens": 78698265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1194
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3983333333333333,
      "grad_norm": 1.7368222415825585e-07,
      "kl": 0.0447998046875,
      "learning_rate": 1.5060337641211637e-05,
      "loss": 0.0018,
      "num_tokens": 78776089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1195
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.39866666666666667,
      "grad_norm": 1.2539558724711242e-07,
      "kl": 0.04925537109375,
      "learning_rate": 1.5050298415746903e-05,
      "loss": 0.002,
      "num_tokens": 78851273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1196
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.399,
      "grad_norm": 2.0768104036505974e-07,
      "kl": 0.0452880859375,
      "learning_rate": 1.5040252352910168e-05,
      "loss": 0.0018,
      "num_tokens": 78929033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1197
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.3993333333333333,
      "grad_norm": 1.5647692919174006e-07,
      "kl": 0.04742431640625,
      "learning_rate": 1.5030199466302354e-05,
      "loss": 0.0019,
      "num_tokens": 79003129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1198
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.39966666666666667,
      "grad_norm": 1.1520349829652332e-07,
      "kl": 0.043212890625,
      "learning_rate": 1.5020139769533604e-05,
      "loss": 0.0017,
      "num_tokens": 79076185.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1199
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4,
      "grad_norm": 1.4021510708062124e-07,
      "kl": 0.0445556640625,
      "learning_rate": 1.5010073276223295e-05,
      "loss": 0.0018,
      "num_tokens": 79149657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1200
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4003333333333333,
      "grad_norm": 1.8614881014400453e-07,
      "kl": 0.04443359375,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.0018,
      "num_tokens": 79225913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1201
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.40066666666666667,
      "grad_norm": 1.2410062311118963e-07,
      "kl": 0.04364013671875,
      "learning_rate": 1.4989919954501474e-05,
      "loss": 0.0017,
      "num_tokens": 79300089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1202
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.401,
      "grad_norm": 2.0684252888258925e-07,
      "kl": 0.04833984375,
      "learning_rate": 1.4979833153374636e-05,
      "loss": 0.0019,
      "num_tokens": 79377321.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1203
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4013333333333333,
      "grad_norm": 1.9826116215426737e-07,
      "kl": 0.0465087890625,
      "learning_rate": 1.4969739610275556e-05,
      "loss": 0.0019,
      "num_tokens": 79452633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1204
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.40166666666666667,
      "grad_norm": 1.449653552754171e-07,
      "kl": 0.048583984375,
      "learning_rate": 1.4959639338869423e-05,
      "loss": 0.0019,
      "num_tokens": 79526681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1205
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.402,
      "grad_norm": 1.6206243458327663e-07,
      "kl": 0.04620361328125,
      "learning_rate": 1.4949532352830543e-05,
      "loss": 0.0018,
      "num_tokens": 79601065.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1206
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4023333333333333,
      "grad_norm": 2.899886055729439e-07,
      "kl": 0.050048828125,
      "learning_rate": 1.493941866584231e-05,
      "loss": 0.002,
      "num_tokens": 79676745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1207
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4026666666666667,
      "grad_norm": 2.1126996330167458e-07,
      "kl": 0.0498046875,
      "learning_rate": 1.4929298291597195e-05,
      "loss": 0.002,
      "num_tokens": 79751977.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1208
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.403,
      "grad_norm": 2.8342509494905244e-07,
      "kl": 0.04742431640625,
      "learning_rate": 1.4919171243796706e-05,
      "loss": 0.0019,
      "num_tokens": 79827577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1209
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4033333333333333,
      "grad_norm": 1.6033871474974148e-07,
      "kl": 0.04632568359375,
      "learning_rate": 1.490903753615141e-05,
      "loss": 0.0019,
      "num_tokens": 79902825.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1210
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4036666666666667,
      "grad_norm": 2.413310653537337e-07,
      "kl": 0.0455322265625,
      "learning_rate": 1.4898897182380872e-05,
      "loss": 0.0018,
      "num_tokens": 79980281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1211
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.404,
      "grad_norm": 1.9254416372405103e-07,
      "kl": 0.04681396484375,
      "learning_rate": 1.4888750196213661e-05,
      "loss": 0.0019,
      "num_tokens": 80055401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1212
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4043333333333333,
      "grad_norm": 2.6125357521777914e-07,
      "kl": 0.0460205078125,
      "learning_rate": 1.4878596591387329e-05,
      "loss": 0.0018,
      "num_tokens": 80136905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1213
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4046666666666667,
      "grad_norm": 2.127672473761777e-07,
      "kl": 0.04913330078125,
      "learning_rate": 1.486843638164838e-05,
      "loss": 0.002,
      "num_tokens": 80220537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1214
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.405,
      "grad_norm": 1.314412116926178e-07,
      "kl": 0.04498291015625,
      "learning_rate": 1.4858269580752272e-05,
      "loss": 0.0018,
      "num_tokens": 80295337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1215
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4053333333333333,
      "grad_norm": 1.46661093936018e-07,
      "kl": 0.041748046875,
      "learning_rate": 1.4848096202463373e-05,
      "loss": 0.0017,
      "num_tokens": 80375017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1216
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4056666666666667,
      "grad_norm": 1.2819751304959937e-07,
      "kl": 0.04815673828125,
      "learning_rate": 1.4837916260554966e-05,
      "loss": 0.0019,
      "num_tokens": 80453433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1217
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.406,
      "grad_norm": 1.4237167533792672e-07,
      "kl": 0.04864501953125,
      "learning_rate": 1.4827729768809215e-05,
      "loss": 0.0019,
      "num_tokens": 80528857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1218
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4063333333333333,
      "grad_norm": 1.5206752834728832e-07,
      "kl": 0.04400634765625,
      "learning_rate": 1.4817536741017153e-05,
      "loss": 0.0018,
      "num_tokens": 80603961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1219
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4066666666666667,
      "grad_norm": 2.4158873657142976e-07,
      "kl": 0.044189453125,
      "learning_rate": 1.4807337190978666e-05,
      "loss": 0.0018,
      "num_tokens": 80683657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1220
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.407,
      "grad_norm": 2.102984524299245e-07,
      "kl": 0.044189453125,
      "learning_rate": 1.4797131132502464e-05,
      "loss": 0.0018,
      "num_tokens": 80758361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1221
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4073333333333333,
      "grad_norm": 1.6987705464543978e-07,
      "kl": 0.040771484375,
      "learning_rate": 1.478691857940607e-05,
      "loss": 0.0016,
      "num_tokens": 80834777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1222
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4076666666666667,
      "grad_norm": 1.2850320274537808e-07,
      "kl": 0.04522705078125,
      "learning_rate": 1.47766995455158e-05,
      "loss": 0.0018,
      "num_tokens": 80908665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1223
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.408,
      "grad_norm": 2.0308661419221608e-07,
      "kl": 0.044677734375,
      "learning_rate": 1.4766474044666748e-05,
      "loss": 0.0018,
      "num_tokens": 80983545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1224
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4083333333333333,
      "grad_norm": 1.6899363686206925e-07,
      "kl": 0.05181884765625,
      "learning_rate": 1.4756242090702756e-05,
      "loss": 0.0021,
      "num_tokens": 81062521.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1225
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4086666666666667,
      "grad_norm": 1.781420309043824e-07,
      "kl": 0.048095703125,
      "learning_rate": 1.4746003697476406e-05,
      "loss": 0.0019,
      "num_tokens": 81140249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1226
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.409,
      "grad_norm": 1.2896579448806733e-07,
      "kl": 0.04437255859375,
      "learning_rate": 1.4735758878849e-05,
      "loss": 0.0018,
      "num_tokens": 81218105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1227
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4093333333333333,
      "grad_norm": 1.3127252884714835e-07,
      "kl": 0.046142578125,
      "learning_rate": 1.4725507648690542e-05,
      "loss": 0.0018,
      "num_tokens": 81292457.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1228
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4096666666666667,
      "grad_norm": 2.1874187439152593e-07,
      "kl": 0.04534912109375,
      "learning_rate": 1.4715250020879705e-05,
      "loss": 0.0018,
      "num_tokens": 81366745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1229
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.41,
      "grad_norm": 1.505979128069157e-07,
      "kl": 0.04302978515625,
      "learning_rate": 1.4704986009303833e-05,
      "loss": 0.0017,
      "num_tokens": 81442249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1230
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4103333333333333,
      "grad_norm": 2.496243212135596e-07,
      "kl": 0.046875,
      "learning_rate": 1.469471562785891e-05,
      "loss": 0.0019,
      "num_tokens": 81520809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1231
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4106666666666667,
      "grad_norm": 0.00035431934520602226,
      "kl": 0.04376220703125,
      "learning_rate": 1.4684438890449542e-05,
      "loss": 0.0018,
      "num_tokens": 81595593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1232
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.411,
      "grad_norm": 1.585467970244281e-07,
      "kl": 0.04498291015625,
      "learning_rate": 1.4674155810988944e-05,
      "loss": 0.0018,
      "num_tokens": 81671913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1233
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.41133333333333333,
      "grad_norm": 1.5489291627091006e-07,
      "kl": 0.04461669921875,
      "learning_rate": 1.4663866403398915e-05,
      "loss": 0.0018,
      "num_tokens": 81746793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1234
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4116666666666667,
      "grad_norm": 2.075672256296457e-07,
      "kl": 0.04388427734375,
      "learning_rate": 1.4653570681609816e-05,
      "loss": 0.0018,
      "num_tokens": 81822601.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1235
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.412,
      "grad_norm": 1.7507488792034565e-07,
      "kl": 0.0443115234375,
      "learning_rate": 1.4643268659560571e-05,
      "loss": 0.0018,
      "num_tokens": 81904633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1236
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.41233333333333333,
      "grad_norm": 1.3837284029705188e-07,
      "kl": 0.04388427734375,
      "learning_rate": 1.463296035119862e-05,
      "loss": 0.0018,
      "num_tokens": 81980345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1237
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4126666666666667,
      "grad_norm": 1.48684009104727e-07,
      "kl": 0.04254150390625,
      "learning_rate": 1.4622645770479915e-05,
      "loss": 0.0017,
      "num_tokens": 82056985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1238
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.413,
      "grad_norm": 1.086328538235648e-07,
      "kl": 0.041748046875,
      "learning_rate": 1.4612324931368909e-05,
      "loss": 0.0017,
      "num_tokens": 82130793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1239
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.41333333333333333,
      "grad_norm": 2.029710941542362e-07,
      "kl": 0.04400634765625,
      "learning_rate": 1.4601997847838518e-05,
      "loss": 0.0018,
      "num_tokens": 82207657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1240
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4136666666666667,
      "grad_norm": 1.904505495531339e-07,
      "kl": 0.04705810546875,
      "learning_rate": 1.4591664533870118e-05,
      "loss": 0.0019,
      "num_tokens": 82282425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1241
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.414,
      "grad_norm": 1.70188158676865e-07,
      "kl": 0.04486083984375,
      "learning_rate": 1.458132500345352e-05,
      "loss": 0.0018,
      "num_tokens": 82358569.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1242
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.41433333333333333,
      "grad_norm": 1.702246237300642e-07,
      "kl": 0.04901123046875,
      "learning_rate": 1.4570979270586944e-05,
      "loss": 0.002,
      "num_tokens": 82434393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1243
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4146666666666667,
      "grad_norm": 1.6712797901163867e-07,
      "kl": 0.04791259765625,
      "learning_rate": 1.4560627349277017e-05,
      "loss": 0.0019,
      "num_tokens": 82510425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1244
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.415,
      "grad_norm": 1.8289907188773213e-07,
      "kl": 0.04327392578125,
      "learning_rate": 1.4550269253538739e-05,
      "loss": 0.0017,
      "num_tokens": 82584985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1245
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.41533333333333333,
      "grad_norm": 2.3691565331773745e-07,
      "kl": 0.045166015625,
      "learning_rate": 1.4539904997395468e-05,
      "loss": 0.0018,
      "num_tokens": 82660153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1246
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4156666666666667,
      "grad_norm": 2.854472143098974e-07,
      "kl": 0.04815673828125,
      "learning_rate": 1.452953459487891e-05,
      "loss": 0.0019,
      "num_tokens": 82737913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1247
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.416,
      "grad_norm": 1.0733805311247124e-07,
      "kl": 0.04669189453125,
      "learning_rate": 1.4519158060029081e-05,
      "loss": 0.0019,
      "num_tokens": 82812345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1248
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.41633333333333333,
      "grad_norm": 1.1579354719515322e-07,
      "kl": 0.04541015625,
      "learning_rate": 1.4508775406894308e-05,
      "loss": 0.0018,
      "num_tokens": 82889769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1249
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4166666666666667,
      "grad_norm": 9.00773287071388e-08,
      "kl": 0.04547119140625,
      "learning_rate": 1.4498386649531198e-05,
      "loss": 0.0018,
      "num_tokens": 82964649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1250
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.417,
      "grad_norm": 1.5067870151597162e-07,
      "kl": 0.04046630859375,
      "learning_rate": 1.4487991802004625e-05,
      "loss": 0.0016,
      "num_tokens": 83039529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1251
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.41733333333333333,
      "grad_norm": 1.863725742623501e-07,
      "kl": 0.0452880859375,
      "learning_rate": 1.4477590878387697e-05,
      "loss": 0.0018,
      "num_tokens": 83115241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1252
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4176666666666667,
      "grad_norm": 1.9447332988420385e-07,
      "kl": 0.045654296875,
      "learning_rate": 1.4467183892761769e-05,
      "loss": 0.0018,
      "num_tokens": 83191865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1253
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.418,
      "grad_norm": 1.9787889016242843e-07,
      "kl": 0.04180908203125,
      "learning_rate": 1.4456770859216383e-05,
      "loss": 0.0017,
      "num_tokens": 83270009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1254
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.41833333333333333,
      "grad_norm": 2.1966877739032498e-07,
      "kl": 0.0504150390625,
      "learning_rate": 1.4446351791849276e-05,
      "loss": 0.002,
      "num_tokens": 83348457.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1255
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4186666666666667,
      "grad_norm": 1.6139181013841153e-07,
      "kl": 0.04248046875,
      "learning_rate": 1.4435926704766364e-05,
      "loss": 0.0017,
      "num_tokens": 83423993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1256
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.419,
      "grad_norm": 2.2698210955240938e-07,
      "kl": 0.04669189453125,
      "learning_rate": 1.442549561208169e-05,
      "loss": 0.0019,
      "num_tokens": 83499577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1257
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.41933333333333334,
      "grad_norm": 2.0261242639207921e-07,
      "kl": 0.04986572265625,
      "learning_rate": 1.4415058527917454e-05,
      "loss": 0.002,
      "num_tokens": 83575897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1258
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4196666666666667,
      "grad_norm": 9.265646383482817e-08,
      "kl": 0.04656982421875,
      "learning_rate": 1.4404615466403951e-05,
      "loss": 0.0019,
      "num_tokens": 83651193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1259
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.42,
      "grad_norm": 1.5757623827994394e-07,
      "kl": 0.04547119140625,
      "learning_rate": 1.439416644167957e-05,
      "loss": 0.0018,
      "num_tokens": 83727785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1260
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.42033333333333334,
      "grad_norm": 1.3037285384598363e-07,
      "kl": 0.04705810546875,
      "learning_rate": 1.4383711467890776e-05,
      "loss": 0.0019,
      "num_tokens": 83801673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1261
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4206666666666667,
      "grad_norm": 8.008100849110633e-08,
      "kl": 0.04443359375,
      "learning_rate": 1.4373250559192088e-05,
      "loss": 0.0018,
      "num_tokens": 83876169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1262
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.421,
      "grad_norm": 1.897480501611426e-07,
      "kl": 0.04693603515625,
      "learning_rate": 1.4362783729746068e-05,
      "loss": 0.0019,
      "num_tokens": 83951737.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1263
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.42133333333333334,
      "grad_norm": 9.561996705542697e-08,
      "kl": 0.04522705078125,
      "learning_rate": 1.4352310993723277e-05,
      "loss": 0.0018,
      "num_tokens": 84030009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1264
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4216666666666667,
      "grad_norm": 1.865805643319618e-07,
      "kl": 0.04736328125,
      "learning_rate": 1.4341832365302282e-05,
      "loss": 0.0019,
      "num_tokens": 84108601.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1265
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.422,
      "grad_norm": 1.5509465356444707e-07,
      "kl": 0.04541015625,
      "learning_rate": 1.4331347858669631e-05,
      "loss": 0.0018,
      "num_tokens": 84183753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1266
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.42233333333333334,
      "grad_norm": 1.7682758368664508e-07,
      "kl": 0.0458984375,
      "learning_rate": 1.4320857488019826e-05,
      "loss": 0.0018,
      "num_tokens": 84260745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1267
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4226666666666667,
      "grad_norm": 1.6690631809979095e-07,
      "kl": 0.04571533203125,
      "learning_rate": 1.4310361267555302e-05,
      "loss": 0.0018,
      "num_tokens": 84337145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1268
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.423,
      "grad_norm": 1.7728550005813304e-07,
      "kl": 0.04742431640625,
      "learning_rate": 1.4299859211486429e-05,
      "loss": 0.0019,
      "num_tokens": 84412489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1269
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.42333333333333334,
      "grad_norm": 1.2475776145493e-07,
      "kl": 0.0467529296875,
      "learning_rate": 1.4289351334031461e-05,
      "loss": 0.0019,
      "num_tokens": 84486201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1270
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4236666666666667,
      "grad_norm": 1.92818504274328e-07,
      "kl": 0.04327392578125,
      "learning_rate": 1.4278837649416543e-05,
      "loss": 0.0017,
      "num_tokens": 84562777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1271
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.424,
      "grad_norm": 1.1781333597582488e-07,
      "kl": 0.04620361328125,
      "learning_rate": 1.4268318171875683e-05,
      "loss": 0.0018,
      "num_tokens": 84637529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1272
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.42433333333333334,
      "grad_norm": 1.0294044727743312e-07,
      "kl": 0.042236328125,
      "learning_rate": 1.4257792915650728e-05,
      "loss": 0.0017,
      "num_tokens": 84715337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1273
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4246666666666667,
      "grad_norm": 1.5438126865774393e-07,
      "kl": 0.0435791015625,
      "learning_rate": 1.4247261894991344e-05,
      "loss": 0.0017,
      "num_tokens": 84791577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1274
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.425,
      "grad_norm": 1.4589319619062735e-07,
      "kl": 0.0506591796875,
      "learning_rate": 1.4236725124155015e-05,
      "loss": 0.002,
      "num_tokens": 84872169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1275
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.42533333333333334,
      "grad_norm": 2.2272581645665923e-07,
      "kl": 0.0458984375,
      "learning_rate": 1.4226182617406996e-05,
      "loss": 0.0018,
      "num_tokens": 84950089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1276
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4256666666666667,
      "grad_norm": 1.4727811503689736e-07,
      "kl": 0.04754638671875,
      "learning_rate": 1.4215634389020314e-05,
      "loss": 0.0019,
      "num_tokens": 85026921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1277
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.426,
      "grad_norm": 1.4976529882915202e-07,
      "kl": 0.04833984375,
      "learning_rate": 1.4205080453275739e-05,
      "loss": 0.0019,
      "num_tokens": 85103625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1278
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.42633333333333334,
      "grad_norm": 1.0492107094250969e-07,
      "kl": 0.047607421875,
      "learning_rate": 1.4194520824461773e-05,
      "loss": 0.0019,
      "num_tokens": 85176809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1279
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4266666666666667,
      "grad_norm": 1.578942203650513e-07,
      "kl": 0.04119873046875,
      "learning_rate": 1.4183955516874624e-05,
      "loss": 0.0016,
      "num_tokens": 85254489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1280
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.427,
      "grad_norm": 1.8067058249471302e-07,
      "kl": 0.04754638671875,
      "learning_rate": 1.417338454481818e-05,
      "loss": 0.0019,
      "num_tokens": 85329529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1281
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.42733333333333334,
      "grad_norm": 1.1183752945953529e-07,
      "kl": 0.04620361328125,
      "learning_rate": 1.4162807922604014e-05,
      "loss": 0.0018,
      "num_tokens": 85404089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1282
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.42766666666666664,
      "grad_norm": 1.6645712719309813e-07,
      "kl": 0.04473876953125,
      "learning_rate": 1.4152225664551333e-05,
      "loss": 0.0018,
      "num_tokens": 85479001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1283
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.428,
      "grad_norm": 1.1262783772281182e-07,
      "kl": 0.04620361328125,
      "learning_rate": 1.4141637784986984e-05,
      "loss": 0.0019,
      "num_tokens": 85553657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1284
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.42833333333333334,
      "grad_norm": 1.2058814036208787e-07,
      "kl": 0.0472412109375,
      "learning_rate": 1.413104429824542e-05,
      "loss": 0.0019,
      "num_tokens": 85628217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1285
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.42866666666666664,
      "grad_norm": 1.3350576466564235e-07,
      "kl": 0.04296875,
      "learning_rate": 1.4120445218668687e-05,
      "loss": 0.0017,
      "num_tokens": 85703209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1286
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.429,
      "grad_norm": 1.0737957722994906e-07,
      "kl": 0.05096435546875,
      "learning_rate": 1.4109840560606397e-05,
      "loss": 0.002,
      "num_tokens": 85778473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1287
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.42933333333333334,
      "grad_norm": 1.1782927344938798e-07,
      "kl": 0.04388427734375,
      "learning_rate": 1.4099230338415728e-05,
      "loss": 0.0018,
      "num_tokens": 85852473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1288
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.42966666666666664,
      "grad_norm": 9.832595537773159e-08,
      "kl": 0.04364013671875,
      "learning_rate": 1.408861456646138e-05,
      "loss": 0.0017,
      "num_tokens": 85925849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1289
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43,
      "grad_norm": 1.0591865873266215e-07,
      "kl": 0.04486083984375,
      "learning_rate": 1.4077993259115568e-05,
      "loss": 0.0018,
      "num_tokens": 86000713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1290
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43033333333333335,
      "grad_norm": 2.2875700267377397e-07,
      "kl": 0.0421142578125,
      "learning_rate": 1.4067366430758004e-05,
      "loss": 0.0017,
      "num_tokens": 86077049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1291
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43066666666666664,
      "grad_norm": 2.0644961296056863e-07,
      "kl": 0.0439453125,
      "learning_rate": 1.405673409577587e-05,
      "loss": 0.0018,
      "num_tokens": 86154377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1292
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.431,
      "grad_norm": 2.0429293101642543e-07,
      "kl": 0.0460205078125,
      "learning_rate": 1.4046096268563814e-05,
      "loss": 0.0018,
      "num_tokens": 86228985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1293
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43133333333333335,
      "grad_norm": 1.844732508970992e-07,
      "kl": 0.04315185546875,
      "learning_rate": 1.4035452963523903e-05,
      "loss": 0.0017,
      "num_tokens": 86306745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1294
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43166666666666664,
      "grad_norm": 1.8191859396665677e-07,
      "kl": 0.047119140625,
      "learning_rate": 1.402480419506563e-05,
      "loss": 0.0019,
      "num_tokens": 86382777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1295
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.432,
      "grad_norm": 9.757667385201785e-08,
      "kl": 0.044677734375,
      "learning_rate": 1.4014149977605893e-05,
      "loss": 0.0018,
      "num_tokens": 86456633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1296
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43233333333333335,
      "grad_norm": 2.0536421629913093e-07,
      "kl": 0.04278564453125,
      "learning_rate": 1.4003490325568953e-05,
      "loss": 0.0017,
      "num_tokens": 86532601.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1297
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43266666666666664,
      "grad_norm": 1157.24365234375,
      "kl": 16.53192138671875,
      "learning_rate": 1.3992825253386428e-05,
      "loss": 0.6595,
      "num_tokens": 86612905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1298
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.433,
      "grad_norm": 7.503676613396237e-08,
      "kl": 0.0423583984375,
      "learning_rate": 1.3982154775497287e-05,
      "loss": 0.0017,
      "num_tokens": 86686489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1299
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43333333333333335,
      "grad_norm": 5.4179398745191065e-08,
      "kl": 0.0374755859375,
      "learning_rate": 1.3971478906347806e-05,
      "loss": 0.0015,
      "num_tokens": 86762089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1300
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43366666666666664,
      "grad_norm": 4.304170531099771e-08,
      "kl": 0.044921875,
      "learning_rate": 1.396079766039157e-05,
      "loss": 0.0018,
      "num_tokens": 86839145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1301
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.434,
      "grad_norm": 3.2212387424124245e-08,
      "kl": 0.04461669921875,
      "learning_rate": 1.3950111052089432e-05,
      "loss": 0.0018,
      "num_tokens": 86912729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1302
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43433333333333335,
      "grad_norm": 1.4443187978940841e-08,
      "kl": 0.04534912109375,
      "learning_rate": 1.3939419095909513e-05,
      "loss": 0.0018,
      "num_tokens": 86986969.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1303
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43466666666666665,
      "grad_norm": 1.2370498403413421e-08,
      "kl": 0.0509033203125,
      "learning_rate": 1.3928721806327173e-05,
      "loss": 0.002,
      "num_tokens": 87060809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1304
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.435,
      "grad_norm": 2.5710283679813983e-08,
      "kl": 0.0479736328125,
      "learning_rate": 1.3918019197824985e-05,
      "loss": 0.0019,
      "num_tokens": 87136425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1305
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43533333333333335,
      "grad_norm": 1.757689993553413e-08,
      "kl": 0.04754638671875,
      "learning_rate": 1.3907311284892737e-05,
      "loss": 0.0019,
      "num_tokens": 87213049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1306
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43566666666666665,
      "grad_norm": 5.390435209307043e-09,
      "kl": 0.0445556640625,
      "learning_rate": 1.389659808202739e-05,
      "loss": 0.0018,
      "num_tokens": 87287529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1307
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.436,
      "grad_norm": 8.44595682281124e-09,
      "kl": 0.04620361328125,
      "learning_rate": 1.3885879603733066e-05,
      "loss": 0.0018,
      "num_tokens": 87362745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1308
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43633333333333335,
      "grad_norm": 6.079368120026629e-09,
      "kl": 0.04791259765625,
      "learning_rate": 1.3875155864521031e-05,
      "loss": 0.0019,
      "num_tokens": 87443769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1309
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43666666666666665,
      "grad_norm": 9.99597205009195e-09,
      "kl": 0.04534912109375,
      "learning_rate": 1.3864426878909674e-05,
      "loss": 0.0018,
      "num_tokens": 87520905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1310
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.437,
      "grad_norm": 5.980236306157849e-09,
      "kl": 0.04656982421875,
      "learning_rate": 1.3853692661424485e-05,
      "loss": 0.0019,
      "num_tokens": 87594425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1311
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43733333333333335,
      "grad_norm": 4.2112660025850346e-09,
      "kl": 0.04803466796875,
      "learning_rate": 1.3842953226598036e-05,
      "loss": 0.0019,
      "num_tokens": 87669177.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1312
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43766666666666665,
      "grad_norm": 3.886565291821853e-09,
      "kl": 0.04547119140625,
      "learning_rate": 1.3832208588969975e-05,
      "loss": 0.0018,
      "num_tokens": 87743449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1313
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.438,
      "grad_norm": 2.714927394009692e-09,
      "kl": 0.04595947265625,
      "learning_rate": 1.3821458763086973e-05,
      "loss": 0.0018,
      "num_tokens": 87818073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1314
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43833333333333335,
      "grad_norm": 2.8266256002495993e-09,
      "kl": 0.04498291015625,
      "learning_rate": 1.3810703763502744e-05,
      "loss": 0.0018,
      "num_tokens": 87892873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1315
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43866666666666665,
      "grad_norm": 2.116833153209541e-09,
      "kl": 0.04107666015625,
      "learning_rate": 1.3799943604777993e-05,
      "loss": 0.0016,
      "num_tokens": 87968873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1316
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.439,
      "grad_norm": 3.138934001611915e-09,
      "kl": 0.04656982421875,
      "learning_rate": 1.3789178301480415e-05,
      "loss": 0.0019,
      "num_tokens": 88043081.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1317
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43933333333333335,
      "grad_norm": 3.457683028784686e-09,
      "kl": 0.04510498046875,
      "learning_rate": 1.3778407868184674e-05,
      "loss": 0.0018,
      "num_tokens": 88119225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1318
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.43966666666666665,
      "grad_norm": 2.765699003148825e-09,
      "kl": 0.04766845703125,
      "learning_rate": 1.3767632319472373e-05,
      "loss": 0.0019,
      "num_tokens": 88198617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1319
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44,
      "grad_norm": 2.8450444222727356e-09,
      "kl": 0.0477294921875,
      "learning_rate": 1.375685166993204e-05,
      "loss": 0.0019,
      "num_tokens": 88274873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1320
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44033333333333335,
      "grad_norm": 1.6249828149739187e-09,
      "kl": 0.04498291015625,
      "learning_rate": 1.3746065934159123e-05,
      "loss": 0.0018,
      "num_tokens": 88352441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1321
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44066666666666665,
      "grad_norm": 1.7124592854855791e-09,
      "kl": 0.04473876953125,
      "learning_rate": 1.3735275126755933e-05,
      "loss": 0.0018,
      "num_tokens": 88427785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1322
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.441,
      "grad_norm": 1.8947714508499303e-09,
      "kl": 0.0467529296875,
      "learning_rate": 1.3724479262331662e-05,
      "loss": 0.0019,
      "num_tokens": 88501769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1323
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44133333333333336,
      "grad_norm": 1.7100232341249466e-09,
      "kl": 0.05010986328125,
      "learning_rate": 1.371367835550235e-05,
      "loss": 0.002,
      "num_tokens": 88585097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1324
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44166666666666665,
      "grad_norm": 1.8501377097024374e-09,
      "kl": 0.04571533203125,
      "learning_rate": 1.3702872420890853e-05,
      "loss": 0.0018,
      "num_tokens": 88660857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1325
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.442,
      "grad_norm": 2.5744724130305485e-09,
      "kl": 0.04754638671875,
      "learning_rate": 1.3692061473126845e-05,
      "loss": 0.0019,
      "num_tokens": 88738873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1326
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44233333333333336,
      "grad_norm": 1.300720864350069e-09,
      "kl": 0.04833984375,
      "learning_rate": 1.3681245526846782e-05,
      "loss": 0.0019,
      "num_tokens": 88812921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1327
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44266666666666665,
      "grad_norm": 2.000139165403425e-09,
      "kl": 0.045654296875,
      "learning_rate": 1.3670424596693884e-05,
      "loss": 0.0018,
      "num_tokens": 88888297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1328
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.443,
      "grad_norm": 1.4673513515361947e-09,
      "kl": 0.0445556640625,
      "learning_rate": 1.3659598697318122e-05,
      "loss": 0.0018,
      "num_tokens": 88962489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1329
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44333333333333336,
      "grad_norm": 2.1496429081224733e-09,
      "kl": 0.04644775390625,
      "learning_rate": 1.3648767843376196e-05,
      "loss": 0.0019,
      "num_tokens": 89038297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1330
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44366666666666665,
      "grad_norm": 2.0196293526453246e-09,
      "kl": 0.04058837890625,
      "learning_rate": 1.3637932049531517e-05,
      "loss": 0.0016,
      "num_tokens": 89118265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1331
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.444,
      "grad_norm": 1.716194519829628e-09,
      "kl": 0.04852294921875,
      "learning_rate": 1.3627091330454172e-05,
      "loss": 0.0019,
      "num_tokens": 89193257.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1332
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44433333333333336,
      "grad_norm": 1.683356787296475e-09,
      "kl": 0.04669189453125,
      "learning_rate": 1.3616245700820922e-05,
      "loss": 0.0019,
      "num_tokens": 89268697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1333
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44466666666666665,
      "grad_norm": 1.1414635903150838e-09,
      "kl": 0.04693603515625,
      "learning_rate": 1.3605395175315188e-05,
      "loss": 0.0019,
      "num_tokens": 89346873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1334
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.445,
      "grad_norm": 1.3487523320421246e-09,
      "kl": 0.041168212890625,
      "learning_rate": 1.3594539768627e-05,
      "loss": 0.0016,
      "num_tokens": 89424377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1335
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44533333333333336,
      "grad_norm": 1.5093900573859287e-09,
      "kl": 0.04534912109375,
      "learning_rate": 1.3583679495453e-05,
      "loss": 0.0018,
      "num_tokens": 89499353.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1336
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44566666666666666,
      "grad_norm": 1.5548657916752973e-09,
      "kl": 0.04779052734375,
      "learning_rate": 1.3572814370496441e-05,
      "loss": 0.0019,
      "num_tokens": 89576169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1337
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.446,
      "grad_norm": 1.6952255155189278e-09,
      "kl": 0.04583740234375,
      "learning_rate": 1.3561944408467112e-05,
      "loss": 0.0018,
      "num_tokens": 89650089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1338
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44633333333333336,
      "grad_norm": 1.664295923298198e-09,
      "kl": 0.04608154296875,
      "learning_rate": 1.3551069624081372e-05,
      "loss": 0.0018,
      "num_tokens": 89725305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1339
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44666666666666666,
      "grad_norm": 1.7554038223011048e-09,
      "kl": 0.0465087890625,
      "learning_rate": 1.3540190032062102e-05,
      "loss": 0.0019,
      "num_tokens": 89799497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1340
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.447,
      "grad_norm": 1.3865159020909346e-09,
      "kl": 0.04388427734375,
      "learning_rate": 1.3529305647138689e-05,
      "loss": 0.0018,
      "num_tokens": 89873033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1341
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44733333333333336,
      "grad_norm": 1.2171670338290141e-09,
      "kl": 0.04754638671875,
      "learning_rate": 1.3518416484047018e-05,
      "loss": 0.0019,
      "num_tokens": 89946921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1342
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44766666666666666,
      "grad_norm": 1.6039399808320809e-09,
      "kl": 0.04608154296875,
      "learning_rate": 1.3507522557529438e-05,
      "loss": 0.0018,
      "num_tokens": 90021705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1343
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.448,
      "grad_norm": 1.9966848174846064e-09,
      "kl": 0.0465087890625,
      "learning_rate": 1.3496623882334738e-05,
      "loss": 0.0019,
      "num_tokens": 90098553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1344
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4483333333333333,
      "grad_norm": 1.5451780965847206e-09,
      "kl": 0.04833984375,
      "learning_rate": 1.3485720473218153e-05,
      "loss": 0.0019,
      "num_tokens": 90172921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1345
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44866666666666666,
      "grad_norm": 1.5867720470907898e-09,
      "kl": 0.04302978515625,
      "learning_rate": 1.3474812344941315e-05,
      "loss": 0.0017,
      "num_tokens": 90250441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1346
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.449,
      "grad_norm": 9.50118317355475e-10,
      "kl": 0.04559326171875,
      "learning_rate": 1.3463899512272249e-05,
      "loss": 0.0018,
      "num_tokens": 90324649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1347
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4493333333333333,
      "grad_norm": 1.4622596467006588e-09,
      "kl": 0.04290771484375,
      "learning_rate": 1.3452981989985347e-05,
      "loss": 0.0017,
      "num_tokens": 90399945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1348
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.44966666666666666,
      "grad_norm": 1.6429176907806209e-09,
      "kl": 0.04595947265625,
      "learning_rate": 1.3442059792861356e-05,
      "loss": 0.0018,
      "num_tokens": 90474921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1349
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.45,
      "grad_norm": 1.171096331020749e-09,
      "kl": 0.047607421875,
      "learning_rate": 1.343113293568734e-05,
      "loss": 0.0019,
      "num_tokens": 90549945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1350
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4503333333333333,
      "grad_norm": 2.0986277160517375e-09,
      "kl": 0.039306640625,
      "learning_rate": 1.342020143325669e-05,
      "loss": 0.0016,
      "num_tokens": 90628921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1351
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.45066666666666666,
      "grad_norm": 3.771317036438404e-09,
      "kl": 0.04345703125,
      "learning_rate": 1.3409265300369065e-05,
      "loss": 0.0017,
      "num_tokens": 90706633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1352
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.451,
      "grad_norm": 2.9408953050591435e-09,
      "kl": 0.04150390625,
      "learning_rate": 1.3398324551830416e-05,
      "loss": 0.0017,
      "num_tokens": 90784633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1353
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4513333333333333,
      "grad_norm": 1.4011295457194706e-09,
      "kl": 0.04315185546875,
      "learning_rate": 1.3387379202452917e-05,
      "loss": 0.0017,
      "num_tokens": 90862345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1354
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.45166666666666666,
      "grad_norm": 1.1908668495763663e-09,
      "kl": 0.04931640625,
      "learning_rate": 1.3376429267054991e-05,
      "loss": 0.002,
      "num_tokens": 90938377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1355
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.452,
      "grad_norm": 2.4293096423377847e-09,
      "kl": 0.04376220703125,
      "learning_rate": 1.3365474760461265e-05,
      "loss": 0.0018,
      "num_tokens": 91014329.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1356
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4523333333333333,
      "grad_norm": 2.149812106111426e-09,
      "kl": 0.048095703125,
      "learning_rate": 1.3354515697502552e-05,
      "loss": 0.0019,
      "num_tokens": 91090825.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1357
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.45266666666666666,
      "grad_norm": 1.3354113370667164e-09,
      "kl": 0.04547119140625,
      "learning_rate": 1.3343552093015833e-05,
      "loss": 0.0018,
      "num_tokens": 91165225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1358
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.453,
      "grad_norm": 1.2926862913431592e-09,
      "kl": 0.045654296875,
      "learning_rate": 1.3332583961844243e-05,
      "loss": 0.0018,
      "num_tokens": 91239993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1359
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4533333333333333,
      "grad_norm": 1.5499076466696238e-09,
      "kl": 0.04150390625,
      "learning_rate": 1.3321611318837033e-05,
      "loss": 0.0017,
      "num_tokens": 91319433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1360
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.45366666666666666,
      "grad_norm": 2.1318795617730757e-09,
      "kl": 0.04510498046875,
      "learning_rate": 1.3310634178849583e-05,
      "loss": 0.0018,
      "num_tokens": 91394857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1361
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.454,
      "grad_norm": 1.2255058079446712e-09,
      "kl": 0.04852294921875,
      "learning_rate": 1.3299652556743341e-05,
      "loss": 0.0019,
      "num_tokens": 91470153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1362
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4543333333333333,
      "grad_norm": 1.1797572918581523e-09,
      "kl": 0.04364013671875,
      "learning_rate": 1.3288666467385834e-05,
      "loss": 0.0017,
      "num_tokens": 91544697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1363
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.45466666666666666,
      "grad_norm": 2.0274464329617103e-09,
      "kl": 0.0445556640625,
      "learning_rate": 1.3277675925650635e-05,
      "loss": 0.0018,
      "num_tokens": 91621577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1364
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.455,
      "grad_norm": 1.3492394979053302e-09,
      "kl": 0.045654296875,
      "learning_rate": 1.3266680946417346e-05,
      "loss": 0.0018,
      "num_tokens": 91696281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1365
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4553333333333333,
      "grad_norm": 1.99545913126542e-09,
      "kl": 0.04644775390625,
      "learning_rate": 1.3255681544571568e-05,
      "loss": 0.0019,
      "num_tokens": 91772937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1366
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.45566666666666666,
      "grad_norm": 1.376682767784132e-09,
      "kl": 0.04632568359375,
      "learning_rate": 1.3244677735004904e-05,
      "loss": 0.0019,
      "num_tokens": 91846793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1367
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.456,
      "grad_norm": 1.3739618331953807e-09,
      "kl": 0.04718017578125,
      "learning_rate": 1.3233669532614914e-05,
      "loss": 0.0019,
      "num_tokens": 91921753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1368
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4563333333333333,
      "grad_norm": 1.5161204514058113e-09,
      "kl": 0.048828125,
      "learning_rate": 1.3222656952305113e-05,
      "loss": 0.0019,
      "num_tokens": 91997033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1369
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.45666666666666667,
      "grad_norm": 1.7136055907585046e-09,
      "kl": 0.044189453125,
      "learning_rate": 1.3211640008984934e-05,
      "loss": 0.0018,
      "num_tokens": 92072361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1370
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.457,
      "grad_norm": 1.2126788462296645e-09,
      "kl": 0.04656982421875,
      "learning_rate": 1.3200618717569716e-05,
      "loss": 0.0019,
      "num_tokens": 92147673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1371
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4573333333333333,
      "grad_norm": 1.926088177839347e-09,
      "kl": 0.04302978515625,
      "learning_rate": 1.3189593092980701e-05,
      "loss": 0.0017,
      "num_tokens": 92223433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1372
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.45766666666666667,
      "grad_norm": 1.3767078588244885e-09,
      "kl": 0.04730224609375,
      "learning_rate": 1.317856315014498e-05,
      "loss": 0.0019,
      "num_tokens": 92298665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1373
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.458,
      "grad_norm": 1.9467807366169154e-09,
      "kl": 0.04718017578125,
      "learning_rate": 1.3167528903995497e-05,
      "loss": 0.0019,
      "num_tokens": 92373465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1374
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4583333333333333,
      "grad_norm": 1.3485310645933168e-09,
      "kl": 0.04571533203125,
      "learning_rate": 1.3156490369471026e-05,
      "loss": 0.0018,
      "num_tokens": 92447449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1375
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.45866666666666667,
      "grad_norm": 1.0348388812531084e-09,
      "kl": 0.04608154296875,
      "learning_rate": 1.3145447561516138e-05,
      "loss": 0.0018,
      "num_tokens": 92521497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1376
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.459,
      "grad_norm": 1.3868521886450935e-09,
      "kl": 0.0450439453125,
      "learning_rate": 1.3134400495081197e-05,
      "loss": 0.0018,
      "num_tokens": 92595625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1377
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4593333333333333,
      "grad_norm": 1.4668779524384945e-09,
      "kl": 0.04315185546875,
      "learning_rate": 1.3123349185122328e-05,
      "loss": 0.0017,
      "num_tokens": 92670505.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1378
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.45966666666666667,
      "grad_norm": 2.1983377340717425e-09,
      "kl": 0.0445556640625,
      "learning_rate": 1.3112293646601402e-05,
      "loss": 0.0018,
      "num_tokens": 92747241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1379
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.46,
      "grad_norm": 2.091724571329223e-09,
      "kl": 0.048583984375,
      "learning_rate": 1.3101233894486018e-05,
      "loss": 0.0019,
      "num_tokens": 92831817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1380
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4603333333333333,
      "grad_norm": 2.2024604362513855e-09,
      "kl": 0.04669189453125,
      "learning_rate": 1.3090169943749475e-05,
      "loss": 0.0019,
      "num_tokens": 92907321.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1381
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.46066666666666667,
      "grad_norm": 1.9012000862517198e-09,
      "kl": 0.0462646484375,
      "learning_rate": 1.307910180937076e-05,
      "loss": 0.0018,
      "num_tokens": 92984617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1382
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.461,
      "grad_norm": 3.703891193751474e-09,
      "kl": 0.046142578125,
      "learning_rate": 1.3068029506334526e-05,
      "loss": 0.0018,
      "num_tokens": 93060905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1383
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4613333333333333,
      "grad_norm": 1.9161636721776176e-09,
      "kl": 0.0465087890625,
      "learning_rate": 1.3056953049631059e-05,
      "loss": 0.0019,
      "num_tokens": 93139593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1384
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.46166666666666667,
      "grad_norm": 2.7307713867941175e-09,
      "kl": 0.0450439453125,
      "learning_rate": 1.3045872454256278e-05,
      "loss": 0.0018,
      "num_tokens": 93214057.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1385
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.462,
      "grad_norm": 2.5718513985140135e-09,
      "kl": 0.047607421875,
      "learning_rate": 1.3034787735211708e-05,
      "loss": 0.0019,
      "num_tokens": 93289721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1386
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4623333333333333,
      "grad_norm": 9.667121547707325e-10,
      "kl": 0.04705810546875,
      "learning_rate": 1.3023698907504447e-05,
      "loss": 0.0019,
      "num_tokens": 93366169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1387
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.46266666666666667,
      "grad_norm": 1.2620564593390782e-09,
      "kl": 0.044189453125,
      "learning_rate": 1.301260598614716e-05,
      "loss": 0.0018,
      "num_tokens": 93441017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1388
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.463,
      "grad_norm": 2.6096655947327463e-09,
      "kl": 0.04632568359375,
      "learning_rate": 1.3001508986158057e-05,
      "loss": 0.0019,
      "num_tokens": 93517273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1389
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4633333333333333,
      "grad_norm": 1.2768710533350713e-09,
      "kl": 0.04595947265625,
      "learning_rate": 1.2990407922560869e-05,
      "loss": 0.0018,
      "num_tokens": 93595337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1390
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.46366666666666667,
      "grad_norm": 2.0450832138863007e-09,
      "kl": 0.04803466796875,
      "learning_rate": 1.297930281038482e-05,
      "loss": 0.0019,
      "num_tokens": 93671049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1391
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.464,
      "grad_norm": 2.2590371795416786e-09,
      "kl": 0.0479736328125,
      "learning_rate": 1.2968193664664633e-05,
      "loss": 0.0019,
      "num_tokens": 93747337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1392
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4643333333333333,
      "grad_norm": 4.018368748859302e-09,
      "kl": 0.04583740234375,
      "learning_rate": 1.2957080500440469e-05,
      "loss": 0.0018,
      "num_tokens": 93825273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1393
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4646666666666667,
      "grad_norm": 1.642464053652759e-09,
      "kl": 0.0428466796875,
      "learning_rate": 1.2945963332757949e-05,
      "loss": 0.0017,
      "num_tokens": 93901033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1394
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.465,
      "grad_norm": 2.3243225122371314e-09,
      "kl": 0.0474853515625,
      "learning_rate": 1.2934842176668105e-05,
      "loss": 0.0019,
      "num_tokens": 93976505.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1395
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4653333333333333,
      "grad_norm": 1.3201009174679257e-09,
      "kl": 0.0465087890625,
      "learning_rate": 1.2923717047227368e-05,
      "loss": 0.0019,
      "num_tokens": 94051913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1396
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4656666666666667,
      "grad_norm": 1.878713185021752e-09,
      "kl": 0.04595947265625,
      "learning_rate": 1.2912587959497556e-05,
      "loss": 0.0018,
      "num_tokens": 94128025.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1397
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.466,
      "grad_norm": 1.6010204273442241e-09,
      "kl": 0.04608154296875,
      "learning_rate": 1.2901454928545834e-05,
      "loss": 0.0018,
      "num_tokens": 94204345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1398
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4663333333333333,
      "grad_norm": 1.8525165845773017e-09,
      "kl": 0.0452880859375,
      "learning_rate": 1.2890317969444716e-05,
      "loss": 0.0018,
      "num_tokens": 94281161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1399
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4666666666666667,
      "grad_norm": 1.9988819488503395e-09,
      "kl": 0.04534912109375,
      "learning_rate": 1.2879177097272033e-05,
      "loss": 0.0018,
      "num_tokens": 94357209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1400
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.467,
      "grad_norm": 2.9500384357561416e-09,
      "kl": 0.04437255859375,
      "learning_rate": 1.2868032327110904e-05,
      "loss": 0.0018,
      "num_tokens": 94436153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1401
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4673333333333333,
      "grad_norm": 1.4637302481190773e-09,
      "kl": 0.0450439453125,
      "learning_rate": 1.2856883674049736e-05,
      "loss": 0.0018,
      "num_tokens": 94515241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1402
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4676666666666667,
      "grad_norm": 2.4397126541231273e-09,
      "kl": 0.04730224609375,
      "learning_rate": 1.2845731153182191e-05,
      "loss": 0.0019,
      "num_tokens": 94590969.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1403
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.468,
      "grad_norm": 1.1360166141116679e-09,
      "kl": 0.0435791015625,
      "learning_rate": 1.2834574779607163e-05,
      "loss": 0.0017,
      "num_tokens": 94669017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1404
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4683333333333333,
      "grad_norm": 2.3172164187457156e-09,
      "kl": 0.0452880859375,
      "learning_rate": 1.2823414568428767e-05,
      "loss": 0.0018,
      "num_tokens": 94746665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1405
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4686666666666667,
      "grad_norm": 1.2102564506122349e-09,
      "kl": 0.042724609375,
      "learning_rate": 1.2812250534756307e-05,
      "loss": 0.0017,
      "num_tokens": 94821273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1406
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.469,
      "grad_norm": 2.793179021409742e-09,
      "kl": 0.048095703125,
      "learning_rate": 1.2801082693704272e-05,
      "loss": 0.0019,
      "num_tokens": 94899001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1407
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4693333333333333,
      "grad_norm": 1.588487674730743e-09,
      "kl": 0.04803466796875,
      "learning_rate": 1.2789911060392295e-05,
      "loss": 0.0019,
      "num_tokens": 94973593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1408
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4696666666666667,
      "grad_norm": 1.8864887429970167e-09,
      "kl": 0.04559326171875,
      "learning_rate": 1.277873564994515e-05,
      "loss": 0.0018,
      "num_tokens": 95048153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1409
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.47,
      "grad_norm": 1.6315002682176782e-09,
      "kl": 0.04736328125,
      "learning_rate": 1.2767556477492722e-05,
      "loss": 0.0019,
      "num_tokens": 95121977.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1410
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4703333333333333,
      "grad_norm": 1.7701125010205487e-09,
      "kl": 0.047119140625,
      "learning_rate": 1.2756373558169992e-05,
      "loss": 0.0019,
      "num_tokens": 95196649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1411
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4706666666666667,
      "grad_norm": 1.5310451795258473e-09,
      "kl": 0.04754638671875,
      "learning_rate": 1.274518690711701e-05,
      "loss": 0.0019,
      "num_tokens": 95272137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1412
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.471,
      "grad_norm": 1.2528541537548676e-09,
      "kl": 0.0465087890625,
      "learning_rate": 1.2733996539478883e-05,
      "loss": 0.0019,
      "num_tokens": 95346665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1413
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4713333333333333,
      "grad_norm": 2.6074351566762743e-09,
      "kl": 0.04315185546875,
      "learning_rate": 1.2722802470405744e-05,
      "loss": 0.0017,
      "num_tokens": 95427961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1414
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4716666666666667,
      "grad_norm": 1.394880433380763e-09,
      "kl": 0.04388427734375,
      "learning_rate": 1.271160471505274e-05,
      "loss": 0.0018,
      "num_tokens": 95502329.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1415
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.472,
      "grad_norm": 1.7276177155522987e-09,
      "kl": 0.04766845703125,
      "learning_rate": 1.270040328858001e-05,
      "loss": 0.0019,
      "num_tokens": 95578633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1416
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4723333333333333,
      "grad_norm": 1.752081701944519e-09,
      "kl": 0.04718017578125,
      "learning_rate": 1.2689198206152657e-05,
      "loss": 0.0019,
      "num_tokens": 95653945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1417
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4726666666666667,
      "grad_norm": 1.3293908196487791e-09,
      "kl": 0.04864501953125,
      "learning_rate": 1.2677989482940747e-05,
      "loss": 0.0019,
      "num_tokens": 95728505.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1418
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.473,
      "grad_norm": 1.7600793045247087e-09,
      "kl": 0.0416259765625,
      "learning_rate": 1.2666777134119257e-05,
      "loss": 0.0017,
      "num_tokens": 95805289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1419
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.47333333333333333,
      "grad_norm": 1.7750138026073614e-09,
      "kl": 0.0465087890625,
      "learning_rate": 1.265556117486809e-05,
      "loss": 0.0019,
      "num_tokens": 95882201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1420
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4736666666666667,
      "grad_norm": 1.3607460713771502e-09,
      "kl": 0.046142578125,
      "learning_rate": 1.2644341620372025e-05,
      "loss": 0.0018,
      "num_tokens": 95957113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1421
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.474,
      "grad_norm": 1.334010124587337e-09,
      "kl": 0.041748046875,
      "learning_rate": 1.2633118485820713e-05,
      "loss": 0.0017,
      "num_tokens": 96031465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1422
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.47433333333333333,
      "grad_norm": 1.2358801759759785e-09,
      "kl": 0.04290771484375,
      "learning_rate": 1.2621891786408648e-05,
      "loss": 0.0017,
      "num_tokens": 96106201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1423
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4746666666666667,
      "grad_norm": 1.6416017434295327e-09,
      "kl": 0.0478515625,
      "learning_rate": 1.2610661537335163e-05,
      "loss": 0.0019,
      "num_tokens": 96182201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1424
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.475,
      "grad_norm": 2.3159394402227917e-09,
      "kl": 0.0465087890625,
      "learning_rate": 1.2599427753804377e-05,
      "loss": 0.0019,
      "num_tokens": 96266169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1425
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.47533333333333333,
      "grad_norm": 1.1112566422610826e-09,
      "kl": 0.043212890625,
      "learning_rate": 1.2588190451025209e-05,
      "loss": 0.0017,
      "num_tokens": 96339449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1426
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4756666666666667,
      "grad_norm": 1.889778777908191e-09,
      "kl": 0.04736328125,
      "learning_rate": 1.257694964421134e-05,
      "loss": 0.0019,
      "num_tokens": 96415865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1427
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.476,
      "grad_norm": 1.74467429392422e-09,
      "kl": 0.0526123046875,
      "learning_rate": 1.256570534858119e-05,
      "loss": 0.0021,
      "num_tokens": 96493193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1428
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.47633333333333333,
      "grad_norm": 2.509080054835522e-09,
      "kl": 0.04541015625,
      "learning_rate": 1.2554457579357906e-05,
      "loss": 0.0018,
      "num_tokens": 96571673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1429
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4766666666666667,
      "grad_norm": 1.0961703766909636e-09,
      "kl": 0.04876708984375,
      "learning_rate": 1.2543206351769341e-05,
      "loss": 0.002,
      "num_tokens": 96646409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1430
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.477,
      "grad_norm": 1.38895073220624e-09,
      "kl": 0.04443359375,
      "learning_rate": 1.253195168104802e-05,
      "loss": 0.0018,
      "num_tokens": 96719945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1431
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.47733333333333333,
      "grad_norm": 1.2896351764268843e-09,
      "kl": 0.0501708984375,
      "learning_rate": 1.252069358243114e-05,
      "loss": 0.002,
      "num_tokens": 96795097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1432
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4776666666666667,
      "grad_norm": 2.1438515407368186e-09,
      "kl": 0.04339599609375,
      "learning_rate": 1.2509432071160527e-05,
      "loss": 0.0017,
      "num_tokens": 96872585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1433
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.478,
      "grad_norm": 1.4982761697979186e-09,
      "kl": 0.0465087890625,
      "learning_rate": 1.2498167162482649e-05,
      "loss": 0.0019,
      "num_tokens": 96948329.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1434
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.47833333333333333,
      "grad_norm": 1.269591987096419e-09,
      "kl": 0.04742431640625,
      "learning_rate": 1.2486898871648552e-05,
      "loss": 0.0019,
      "num_tokens": 97023017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1435
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4786666666666667,
      "grad_norm": 2.026363521423491e-09,
      "kl": 0.04449462890625,
      "learning_rate": 1.2475627213913861e-05,
      "loss": 0.0018,
      "num_tokens": 97098281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1436
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.479,
      "grad_norm": 8.847265697831119e-10,
      "kl": 0.04180908203125,
      "learning_rate": 1.246435220453878e-05,
      "loss": 0.0017,
      "num_tokens": 97171961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1437
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.47933333333333333,
      "grad_norm": 1.5877311687617635e-09,
      "kl": 0.044921875,
      "learning_rate": 1.2453073858788027e-05,
      "loss": 0.0018,
      "num_tokens": 97247433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1438
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4796666666666667,
      "grad_norm": 2.2539907718055474e-09,
      "kl": 0.04840087890625,
      "learning_rate": 1.2441792191930856e-05,
      "loss": 0.0019,
      "num_tokens": 97323609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1439
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.48,
      "grad_norm": 1.4704915063390445e-09,
      "kl": 0.044189453125,
      "learning_rate": 1.2430507219240997e-05,
      "loss": 0.0018,
      "num_tokens": 97399513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1440
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.48033333333333333,
      "grad_norm": 1.8854446892646592e-09,
      "kl": 0.04400634765625,
      "learning_rate": 1.2419218955996677e-05,
      "loss": 0.0018,
      "num_tokens": 97476793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1441
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4806666666666667,
      "grad_norm": 2.3348922795207727e-09,
      "kl": 0.04443359375,
      "learning_rate": 1.2407927417480567e-05,
      "loss": 0.0018,
      "num_tokens": 97555769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1442
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.481,
      "grad_norm": 1.3160883494123254e-09,
      "kl": 0.047119140625,
      "learning_rate": 1.2396632618979772e-05,
      "loss": 0.0019,
      "num_tokens": 97629913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1443
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.48133333333333334,
      "grad_norm": 1.3900840478697774e-09,
      "kl": 0.04241943359375,
      "learning_rate": 1.238533457578581e-05,
      "loss": 0.0017,
      "num_tokens": 97704793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1444
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4816666666666667,
      "grad_norm": 2.56626431216489e-09,
      "kl": 0.0445556640625,
      "learning_rate": 1.2374033303194597e-05,
      "loss": 0.0018,
      "num_tokens": 97778921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1445
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.482,
      "grad_norm": 1.310583419567024e-09,
      "kl": 0.04327392578125,
      "learning_rate": 1.2362728816506418e-05,
      "loss": 0.0017,
      "num_tokens": 97853273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1446
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.48233333333333334,
      "grad_norm": 1.58570490071952e-09,
      "kl": 0.048583984375,
      "learning_rate": 1.23514211310259e-05,
      "loss": 0.0019,
      "num_tokens": 97929577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1447
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4826666666666667,
      "grad_norm": 2.092530371200496e-09,
      "kl": 0.0517578125,
      "learning_rate": 1.2340110262062024e-05,
      "loss": 0.0021,
      "num_tokens": 98006553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1448
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.483,
      "grad_norm": 1.4428417349776623e-09,
      "kl": 0.044921875,
      "learning_rate": 1.232879622492806e-05,
      "loss": 0.0018,
      "num_tokens": 98081129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1449
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.48333333333333334,
      "grad_norm": 1.5800878383487316e-09,
      "kl": 0.04693603515625,
      "learning_rate": 1.2317479034941572e-05,
      "loss": 0.0019,
      "num_tokens": 98159705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1450
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4836666666666667,
      "grad_norm": 2.071462779085209e-09,
      "kl": 0.04217529296875,
      "learning_rate": 1.2306158707424402e-05,
      "loss": 0.0017,
      "num_tokens": 98238425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1451
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.484,
      "grad_norm": 1.5709021861098904e-09,
      "kl": 0.04278564453125,
      "learning_rate": 1.2294835257702629e-05,
      "loss": 0.0017,
      "num_tokens": 98315433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1452
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.48433333333333334,
      "grad_norm": 1.7471090130172229e-09,
      "kl": 0.04803466796875,
      "learning_rate": 1.2283508701106559e-05,
      "loss": 0.0019,
      "num_tokens": 98390601.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1453
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4846666666666667,
      "grad_norm": 1.4333209064076868e-09,
      "kl": 0.04132080078125,
      "learning_rate": 1.2272179052970711e-05,
      "loss": 0.0017,
      "num_tokens": 98464345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1454
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.485,
      "grad_norm": 2.2627255624740883e-09,
      "kl": 0.04541015625,
      "learning_rate": 1.2260846328633786e-05,
      "loss": 0.0018,
      "num_tokens": 98541241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1455
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.48533333333333334,
      "grad_norm": 1.397131299540888e-09,
      "kl": 0.041015625,
      "learning_rate": 1.2249510543438652e-05,
      "loss": 0.0016,
      "num_tokens": 98616025.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1456
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4856666666666667,
      "grad_norm": 2.239487706390264e-09,
      "kl": 0.0467529296875,
      "learning_rate": 1.2238171712732316e-05,
      "loss": 0.0019,
      "num_tokens": 98692217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1457
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.486,
      "grad_norm": 1.706297880765817e-09,
      "kl": 0.04638671875,
      "learning_rate": 1.2226829851865911e-05,
      "loss": 0.0019,
      "num_tokens": 98767865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1458
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.48633333333333334,
      "grad_norm": 2.520240238723659e-09,
      "kl": 0.0458984375,
      "learning_rate": 1.2215484976194675e-05,
      "loss": 0.0018,
      "num_tokens": 98844345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1459
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4866666666666667,
      "grad_norm": 1.0638442349275579e-09,
      "kl": 0.04541015625,
      "learning_rate": 1.2204137101077924e-05,
      "loss": 0.0018,
      "num_tokens": 98917753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1460
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.487,
      "grad_norm": 1.5510051021294657e-09,
      "kl": 0.04351806640625,
      "learning_rate": 1.2192786241879033e-05,
      "loss": 0.0017,
      "num_tokens": 98993817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1461
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.48733333333333334,
      "grad_norm": 1.8757706499172855e-09,
      "kl": 0.0430908203125,
      "learning_rate": 1.2181432413965428e-05,
      "loss": 0.0017,
      "num_tokens": 99068521.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1462
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4876666666666667,
      "grad_norm": 2.1416064477364216e-09,
      "kl": 0.047607421875,
      "learning_rate": 1.2170075632708538e-05,
      "loss": 0.0019,
      "num_tokens": 99145353.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1463
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.488,
      "grad_norm": 1.7173015232074818e-09,
      "kl": 0.0457763671875,
      "learning_rate": 1.21587159134838e-05,
      "loss": 0.0018,
      "num_tokens": 99220361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1464
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.48833333333333334,
      "grad_norm": 1.9694794683999817e-09,
      "kl": 0.0418701171875,
      "learning_rate": 1.2147353271670634e-05,
      "loss": 0.0017,
      "num_tokens": 99297625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1465
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.4886666666666667,
      "grad_norm": 1.8061607764963128e-09,
      "kl": 0.04156494140625,
      "learning_rate": 1.2135987722652403e-05,
      "loss": 0.0017,
      "num_tokens": 99374985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1466
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.489,
      "grad_norm": 1.2887576561482206e-09,
      "kl": 0.04608154296875,
      "learning_rate": 1.2124619281816413e-05,
      "loss": 0.0018,
      "num_tokens": 99453369.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1467
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.48933333333333334,
      "grad_norm": 1.375928815328109e-09,
      "kl": 0.0478515625,
      "learning_rate": 1.211324796455389e-05,
      "loss": 0.0019,
      "num_tokens": 99528729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1468
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.48966666666666664,
      "grad_norm": 2.256154374435937e-09,
      "kl": 0.04486083984375,
      "learning_rate": 1.210187378625994e-05,
      "loss": 0.0018,
      "num_tokens": 99603113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1469
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49,
      "grad_norm": 1.2393501780394445e-09,
      "kl": 0.044189453125,
      "learning_rate": 1.2090496762333565e-05,
      "loss": 0.0018,
      "num_tokens": 99675865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1470
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49033333333333334,
      "grad_norm": 1.9452728317048695e-09,
      "kl": 0.0457763671875,
      "learning_rate": 1.2079116908177592e-05,
      "loss": 0.0018,
      "num_tokens": 99752585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1471
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49066666666666664,
      "grad_norm": 1.5343085690844305e-09,
      "kl": 0.0455322265625,
      "learning_rate": 1.2067734239198707e-05,
      "loss": 0.0018,
      "num_tokens": 99827161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1472
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.491,
      "grad_norm": 1.8314056937640544e-09,
      "kl": 0.04486083984375,
      "learning_rate": 1.2056348770807386e-05,
      "loss": 0.0018,
      "num_tokens": 99903065.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1473
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49133333333333334,
      "grad_norm": 2.139466603878759e-09,
      "kl": 0.046875,
      "learning_rate": 1.2044960518417902e-05,
      "loss": 0.0019,
      "num_tokens": 99983321.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1474
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49166666666666664,
      "grad_norm": 2.123553333177597e-09,
      "kl": 0.048828125,
      "learning_rate": 1.2033569497448306e-05,
      "loss": 0.002,
      "num_tokens": 100064793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1475
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.492,
      "grad_norm": 2.2304345037582607e-09,
      "kl": 0.04730224609375,
      "learning_rate": 1.2022175723320382e-05,
      "loss": 0.0019,
      "num_tokens": 100141961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1476
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49233333333333335,
      "grad_norm": 1.4254857294559997e-09,
      "kl": 0.04522705078125,
      "learning_rate": 1.2010779211459649e-05,
      "loss": 0.0018,
      "num_tokens": 100217657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1477
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49266666666666664,
      "grad_norm": 1.55117751976519e-09,
      "kl": 0.04437255859375,
      "learning_rate": 1.1999379977295334e-05,
      "loss": 0.0018,
      "num_tokens": 100292649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1478
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.493,
      "grad_norm": 8.613340041208062e-10,
      "kl": 0.04339599609375,
      "learning_rate": 1.1987978036260346e-05,
      "loss": 0.0017,
      "num_tokens": 100366009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1479
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49333333333333335,
      "grad_norm": 1.5746163262164714e-09,
      "kl": 0.0487060546875,
      "learning_rate": 1.1976573403791263e-05,
      "loss": 0.0019,
      "num_tokens": 100441049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1480
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49366666666666664,
      "grad_norm": 1.8050065886399125e-09,
      "kl": 0.04571533203125,
      "learning_rate": 1.1965166095328302e-05,
      "loss": 0.0018,
      "num_tokens": 100518985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1481
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.494,
      "grad_norm": 1.8187547023984507e-09,
      "kl": 0.0498046875,
      "learning_rate": 1.1953756126315306e-05,
      "loss": 0.002,
      "num_tokens": 100597065.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1482
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49433333333333335,
      "grad_norm": 1.4223089372933373e-09,
      "kl": 0.0433349609375,
      "learning_rate": 1.194234351219972e-05,
      "loss": 0.0017,
      "num_tokens": 100671449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1483
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49466666666666664,
      "grad_norm": 1.969487239961154e-09,
      "kl": 0.0477294921875,
      "learning_rate": 1.1930928268432569e-05,
      "loss": 0.0019,
      "num_tokens": 100747593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1484
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.495,
      "grad_norm": 1.3811733978741358e-09,
      "kl": 0.043701171875,
      "learning_rate": 1.1919510410468435e-05,
      "loss": 0.0017,
      "num_tokens": 100821961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1485
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49533333333333335,
      "grad_norm": 1.7434232946200723e-09,
      "kl": 0.04718017578125,
      "learning_rate": 1.190808995376545e-05,
      "loss": 0.0019,
      "num_tokens": 100898217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1486
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49566666666666664,
      "grad_norm": 1.7419132802842796e-09,
      "kl": 0.048828125,
      "learning_rate": 1.1896666913785248e-05,
      "loss": 0.002,
      "num_tokens": 100973625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1487
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.496,
      "grad_norm": 1.2846314012548987e-09,
      "kl": 0.04583740234375,
      "learning_rate": 1.1885241305992976e-05,
      "loss": 0.0018,
      "num_tokens": 101050489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1488
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49633333333333335,
      "grad_norm": 1.8690116121433675e-09,
      "kl": 0.046142578125,
      "learning_rate": 1.187381314585725e-05,
      "loss": 0.0018,
      "num_tokens": 101129929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1489
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49666666666666665,
      "grad_norm": 2.1334294331154524e-09,
      "kl": 0.04766845703125,
      "learning_rate": 1.1862382448850136e-05,
      "loss": 0.0019,
      "num_tokens": 101206057.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1490
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.497,
      "grad_norm": 1.670320215474419e-09,
      "kl": 0.048828125,
      "learning_rate": 1.1850949230447146e-05,
      "loss": 0.0019,
      "num_tokens": 101281177.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1491
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49733333333333335,
      "grad_norm": 3.2869327259987813e-09,
      "kl": 0.04791259765625,
      "learning_rate": 1.1839513506127202e-05,
      "loss": 0.0019,
      "num_tokens": 101359113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1492
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49766666666666665,
      "grad_norm": 9.076445706135416e-10,
      "kl": 0.04412841796875,
      "learning_rate": 1.1828075291372616e-05,
      "loss": 0.0018,
      "num_tokens": 101437545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1493
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.498,
      "grad_norm": 8.783641591847413e-10,
      "kl": 0.04290771484375,
      "learning_rate": 1.181663460166907e-05,
      "loss": 0.0017,
      "num_tokens": 101511961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1494
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49833333333333335,
      "grad_norm": 1.6328642882257327e-09,
      "kl": 0.0435791015625,
      "learning_rate": 1.1805191452505602e-05,
      "loss": 0.0017,
      "num_tokens": 101588121.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1495
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49866666666666665,
      "grad_norm": 1.7981141020584346e-09,
      "kl": 0.0478515625,
      "learning_rate": 1.1793745859374575e-05,
      "loss": 0.0019,
      "num_tokens": 101663801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1496
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.499,
      "grad_norm": 1.9772374848514573e-09,
      "kl": 0.0482177734375,
      "learning_rate": 1.1782297837771668e-05,
      "loss": 0.0019,
      "num_tokens": 101739753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1497
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49933333333333335,
      "grad_norm": 1.7462925550049135e-09,
      "kl": 0.04656982421875,
      "learning_rate": 1.1770847403195836e-05,
      "loss": 0.0019,
      "num_tokens": 101813657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1498
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.49966666666666665,
      "grad_norm": 1.8387565914324e-09,
      "kl": 0.04473876953125,
      "learning_rate": 1.175939457114931e-05,
      "loss": 0.0018,
      "num_tokens": 101889449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1499
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5,
      "grad_norm": 1.1449774461880224e-09,
      "kl": 0.04833984375,
      "learning_rate": 1.1747939357137568e-05,
      "loss": 0.0019,
      "num_tokens": 101964137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1500
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5003333333333333,
      "grad_norm": 1.7043400024618904e-09,
      "kl": 0.042724609375,
      "learning_rate": 1.1736481776669307e-05,
      "loss": 0.0017,
      "num_tokens": 102045065.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1501
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5006666666666667,
      "grad_norm": 1.949060024486471e-09,
      "kl": 0.04730224609375,
      "learning_rate": 1.1725021845256426e-05,
      "loss": 0.0019,
      "num_tokens": 102120873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1502
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.501,
      "grad_norm": 1.60807422933118e-09,
      "kl": 0.0457763671875,
      "learning_rate": 1.171355957841402e-05,
      "loss": 0.0018,
      "num_tokens": 102198473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1503
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5013333333333333,
      "grad_norm": 1.6719630124839568e-09,
      "kl": 0.04345703125,
      "learning_rate": 1.1702094991660326e-05,
      "loss": 0.0017,
      "num_tokens": 102275065.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1504
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5016666666666667,
      "grad_norm": 1.3148617750147196e-09,
      "kl": 0.047607421875,
      "learning_rate": 1.169062810051674e-05,
      "loss": 0.0019,
      "num_tokens": 102350153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1505
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.502,
      "grad_norm": 1.6983747741505795e-09,
      "kl": 0.04461669921875,
      "learning_rate": 1.1679158920507773e-05,
      "loss": 0.0018,
      "num_tokens": 102424297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1506
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5023333333333333,
      "grad_norm": 1.5975210043706056e-09,
      "kl": 0.0452880859375,
      "learning_rate": 1.1667687467161025e-05,
      "loss": 0.0018,
      "num_tokens": 102500105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1507
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5026666666666667,
      "grad_norm": 1.7908122762477774e-09,
      "kl": 0.04296875,
      "learning_rate": 1.1656213756007184e-05,
      "loss": 0.0017,
      "num_tokens": 102575481.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1508
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.503,
      "grad_norm": 1.2947768412985283e-09,
      "kl": 0.046142578125,
      "learning_rate": 1.1644737802579989e-05,
      "loss": 0.0018,
      "num_tokens": 102650009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1509
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5033333333333333,
      "grad_norm": 1.1567885538354972e-09,
      "kl": 0.0452880859375,
      "learning_rate": 1.1633259622416224e-05,
      "loss": 0.0018,
      "num_tokens": 102724025.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1510
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5036666666666667,
      "grad_norm": 2.3768287338299388e-09,
      "kl": 0.04443359375,
      "learning_rate": 1.1621779231055677e-05,
      "loss": 0.0018,
      "num_tokens": 102799273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1511
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.504,
      "grad_norm": 1.5762814387088042e-09,
      "kl": 0.04400634765625,
      "learning_rate": 1.161029664404113e-05,
      "loss": 0.0018,
      "num_tokens": 102874105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1512
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5043333333333333,
      "grad_norm": 2.59026933235873e-09,
      "kl": 0.04815673828125,
      "learning_rate": 1.159881187691835e-05,
      "loss": 0.0019,
      "num_tokens": 102952185.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1513
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5046666666666667,
      "grad_norm": 1.1417922163303729e-09,
      "kl": 0.04241943359375,
      "learning_rate": 1.158732494523604e-05,
      "loss": 0.0017,
      "num_tokens": 103026521.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1514
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.505,
      "grad_norm": 1.2209561139897573e-09,
      "kl": 0.04296875,
      "learning_rate": 1.1575835864545844e-05,
      "loss": 0.0017,
      "num_tokens": 103102169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1515
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5053333333333333,
      "grad_norm": 2.1902164526466095e-09,
      "kl": 0.04669189453125,
      "learning_rate": 1.156434465040231e-05,
      "loss": 0.0019,
      "num_tokens": 103177225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1516
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5056666666666667,
      "grad_norm": 1.4495937783465251e-09,
      "kl": 0.0479736328125,
      "learning_rate": 1.1552851318362876e-05,
      "loss": 0.0019,
      "num_tokens": 103252441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1517
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.506,
      "grad_norm": 1.3732713854963663e-09,
      "kl": 0.04290771484375,
      "learning_rate": 1.154135588398785e-05,
      "loss": 0.0017,
      "num_tokens": 103328313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1518
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5063333333333333,
      "grad_norm": 1.4534400349930365e-09,
      "kl": 0.04730224609375,
      "learning_rate": 1.1529858362840383e-05,
      "loss": 0.0019,
      "num_tokens": 103403433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1519
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5066666666666667,
      "grad_norm": 9.694033353824238e-10,
      "kl": 0.0457763671875,
      "learning_rate": 1.151835877048645e-05,
      "loss": 0.0018,
      "num_tokens": 103477401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1520
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.507,
      "grad_norm": 1.4473785503454906e-09,
      "kl": 0.04449462890625,
      "learning_rate": 1.1506857122494832e-05,
      "loss": 0.0018,
      "num_tokens": 103557209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1521
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5073333333333333,
      "grad_norm": 1.565738094733149e-09,
      "kl": 0.04644775390625,
      "learning_rate": 1.1495353434437098e-05,
      "loss": 0.0019,
      "num_tokens": 103631513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1522
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5076666666666667,
      "grad_norm": 1.6531110924589143e-09,
      "kl": 0.0491943359375,
      "learning_rate": 1.1483847721887567e-05,
      "loss": 0.002,
      "num_tokens": 103706553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1523
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.508,
      "grad_norm": 1.7490170423073437e-09,
      "kl": 0.04669189453125,
      "learning_rate": 1.1472340000423313e-05,
      "loss": 0.0019,
      "num_tokens": 103779753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1524
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5083333333333333,
      "grad_norm": 1.7742116664720697e-09,
      "kl": 0.0445556640625,
      "learning_rate": 1.1460830285624119e-05,
      "loss": 0.0018,
      "num_tokens": 103859417.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1525
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5086666666666667,
      "grad_norm": 1.2627474621496049e-09,
      "kl": 0.0455322265625,
      "learning_rate": 1.1449318593072468e-05,
      "loss": 0.0018,
      "num_tokens": 103937177.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1526
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.509,
      "grad_norm": 9.279628176983579e-10,
      "kl": 0.04559326171875,
      "learning_rate": 1.143780493835353e-05,
      "loss": 0.0018,
      "num_tokens": 104010777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1527
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5093333333333333,
      "grad_norm": 1.976980801288164e-09,
      "kl": 0.04974365234375,
      "learning_rate": 1.1426289337055119e-05,
      "loss": 0.002,
      "num_tokens": 104087145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1528
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5096666666666667,
      "grad_norm": 1.201163279951345e-09,
      "kl": 0.04742431640625,
      "learning_rate": 1.141477180476769e-05,
      "loss": 0.0019,
      "num_tokens": 104162793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1529
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.51,
      "grad_norm": 2.458331094246091e-09,
      "kl": 0.0447998046875,
      "learning_rate": 1.1403252357084315e-05,
      "loss": 0.0018,
      "num_tokens": 104241001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1530
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5103333333333333,
      "grad_norm": 1.587015741044695e-09,
      "kl": 0.04559326171875,
      "learning_rate": 1.1391731009600655e-05,
      "loss": 0.0018,
      "num_tokens": 104316841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1531
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5106666666666667,
      "grad_norm": 1.4630648914604194e-09,
      "kl": 0.042724609375,
      "learning_rate": 1.1380207777914946e-05,
      "loss": 0.0017,
      "num_tokens": 104392121.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1532
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.511,
      "grad_norm": 1.488992262821398e-09,
      "kl": 0.04449462890625,
      "learning_rate": 1.1368682677627971e-05,
      "loss": 0.0018,
      "num_tokens": 104466873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1533
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5113333333333333,
      "grad_norm": 1.585642728230141e-09,
      "kl": 0.04608154296875,
      "learning_rate": 1.1357155724343046e-05,
      "loss": 0.0018,
      "num_tokens": 104540873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1534
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5116666666666667,
      "grad_norm": 1.531700433154981e-09,
      "kl": 0.04547119140625,
      "learning_rate": 1.1345626933665996e-05,
      "loss": 0.0018,
      "num_tokens": 104614393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1535
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.512,
      "grad_norm": 1.9156807251619057e-09,
      "kl": 0.05059814453125,
      "learning_rate": 1.1334096321205129e-05,
      "loss": 0.002,
      "num_tokens": 104690889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1536
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5123333333333333,
      "grad_norm": 3.2628990620509057e-09,
      "kl": 0.044921875,
      "learning_rate": 1.1322563902571227e-05,
      "loss": 0.0018,
      "num_tokens": 104767577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1537
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5126666666666667,
      "grad_norm": 1.294423901399e-09,
      "kl": 0.04449462890625,
      "learning_rate": 1.1311029693377511e-05,
      "loss": 0.0018,
      "num_tokens": 104841161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1538
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.513,
      "grad_norm": 2.1832582408620738e-09,
      "kl": 0.043212890625,
      "learning_rate": 1.1299493709239628e-05,
      "loss": 0.0017,
      "num_tokens": 104918233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1539
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5133333333333333,
      "grad_norm": 2.7833639837382407e-09,
      "kl": 0.0458984375,
      "learning_rate": 1.128795596577563e-05,
      "loss": 0.0018,
      "num_tokens": 104994233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1540
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5136666666666667,
      "grad_norm": 1.2344314459511452e-09,
      "kl": 0.04583740234375,
      "learning_rate": 1.127641647860595e-05,
      "loss": 0.0018,
      "num_tokens": 105068297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1541
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.514,
      "grad_norm": 2.1082129375571412e-09,
      "kl": 0.04412841796875,
      "learning_rate": 1.1264875263353375e-05,
      "loss": 0.0018,
      "num_tokens": 105143865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1542
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5143333333333333,
      "grad_norm": 2.350319716626359e-09,
      "kl": 0.04864501953125,
      "learning_rate": 1.1253332335643043e-05,
      "loss": 0.0019,
      "num_tokens": 105219481.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1543
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5146666666666667,
      "grad_norm": 1.685978023857615e-09,
      "kl": 0.0474853515625,
      "learning_rate": 1.1241787711102405e-05,
      "loss": 0.0019,
      "num_tokens": 105294489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1544
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.515,
      "grad_norm": 1.740410704442752e-09,
      "kl": 0.042724609375,
      "learning_rate": 1.1230241405361209e-05,
      "loss": 0.0017,
      "num_tokens": 105369945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1545
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5153333333333333,
      "grad_norm": 2.745652816216193e-09,
      "kl": 0.04925537109375,
      "learning_rate": 1.1218693434051475e-05,
      "loss": 0.002,
      "num_tokens": 105445465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1546
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5156666666666667,
      "grad_norm": 2.3614223909618204e-09,
      "kl": 0.0479736328125,
      "learning_rate": 1.1207143812807489e-05,
      "loss": 0.0019,
      "num_tokens": 105523945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1547
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.516,
      "grad_norm": 2.0211039508666317e-09,
      "kl": 0.049072265625,
      "learning_rate": 1.1195592557265757e-05,
      "loss": 0.002,
      "num_tokens": 105598361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1548
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5163333333333333,
      "grad_norm": 1.0911613834707623e-09,
      "kl": 0.044921875,
      "learning_rate": 1.1184039683065014e-05,
      "loss": 0.0018,
      "num_tokens": 105673401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1549
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5166666666666667,
      "grad_norm": 1.3093715001133432e-09,
      "kl": 0.0445556640625,
      "learning_rate": 1.1172485205846161e-05,
      "loss": 0.0018,
      "num_tokens": 105746281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1550
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.517,
      "grad_norm": 1.5854247914504072e-09,
      "kl": 0.04217529296875,
      "learning_rate": 1.1160929141252303e-05,
      "loss": 0.0017,
      "num_tokens": 105820873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1551
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5173333333333333,
      "grad_norm": 1.7735748425451447e-09,
      "kl": 0.0447998046875,
      "learning_rate": 1.1149371504928667e-05,
      "loss": 0.0018,
      "num_tokens": 105898697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1552
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5176666666666667,
      "grad_norm": 1.7111051464624438e-09,
      "kl": 0.0458984375,
      "learning_rate": 1.1137812312522618e-05,
      "loss": 0.0018,
      "num_tokens": 105975001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1553
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.518,
      "grad_norm": 1.2639812529968708e-09,
      "kl": 0.04449462890625,
      "learning_rate": 1.112625157968363e-05,
      "loss": 0.0018,
      "num_tokens": 106050265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1554
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5183333333333333,
      "grad_norm": 1.3364678252969497e-09,
      "kl": 0.04571533203125,
      "learning_rate": 1.1114689322063255e-05,
      "loss": 0.0018,
      "num_tokens": 106125721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1555
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5186666666666667,
      "grad_norm": 2.1918862280756457e-09,
      "kl": 0.047607421875,
      "learning_rate": 1.110312555531512e-05,
      "loss": 0.0019,
      "num_tokens": 106202105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1556
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.519,
      "grad_norm": 2.3662052317519056e-09,
      "kl": 0.0491943359375,
      "learning_rate": 1.109156029509488e-05,
      "loss": 0.002,
      "num_tokens": 106283257.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1557
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5193333333333333,
      "grad_norm": 1.356117107498278e-09,
      "kl": 0.0482177734375,
      "learning_rate": 1.1079993557060228e-05,
      "loss": 0.0019,
      "num_tokens": 106356857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1558
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5196666666666667,
      "grad_norm": 1.7797547879894182e-09,
      "kl": 0.044921875,
      "learning_rate": 1.1068425356870853e-05,
      "loss": 0.0018,
      "num_tokens": 106430921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1559
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.52,
      "grad_norm": 6.506926553129233e-09,
      "kl": 0.05059814453125,
      "learning_rate": 1.1056855710188413e-05,
      "loss": 0.002,
      "num_tokens": 106514425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1560
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5203333333333333,
      "grad_norm": 2.244831209807785e-09,
      "kl": 0.046142578125,
      "learning_rate": 1.1045284632676535e-05,
      "loss": 0.0018,
      "num_tokens": 106590777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1561
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5206666666666667,
      "grad_norm": 2.2646711173024414e-09,
      "kl": 0.04278564453125,
      "learning_rate": 1.1033712140000787e-05,
      "loss": 0.0017,
      "num_tokens": 106667401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1562
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.521,
      "grad_norm": 1.2767026325022357e-09,
      "kl": 0.04571533203125,
      "learning_rate": 1.1022138247828638e-05,
      "loss": 0.0018,
      "num_tokens": 106742041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1563
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5213333333333333,
      "grad_norm": 1.7591560430574305e-09,
      "kl": 0.0428466796875,
      "learning_rate": 1.1010562971829464e-05,
      "loss": 0.0017,
      "num_tokens": 106818857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1564
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5216666666666666,
      "grad_norm": 1.4238425993795545e-09,
      "kl": 0.04779052734375,
      "learning_rate": 1.0998986327674515e-05,
      "loss": 0.0019,
      "num_tokens": 106893625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1565
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.522,
      "grad_norm": 1.7487855608067093e-09,
      "kl": 0.04510498046875,
      "learning_rate": 1.0987408331036879e-05,
      "loss": 0.0018,
      "num_tokens": 106971113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1566
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5223333333333333,
      "grad_norm": 1.294450546751591e-09,
      "kl": 0.04144287109375,
      "learning_rate": 1.0975828997591496e-05,
      "loss": 0.0017,
      "num_tokens": 107043545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1567
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5226666666666666,
      "grad_norm": 2.2019452927679595e-09,
      "kl": 0.04510498046875,
      "learning_rate": 1.09642483430151e-05,
      "loss": 0.0018,
      "num_tokens": 107117369.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1568
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.523,
      "grad_norm": 1.363847812463348e-09,
      "kl": 0.047119140625,
      "learning_rate": 1.0952666382986216e-05,
      "loss": 0.0019,
      "num_tokens": 107194345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1569
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5233333333333333,
      "grad_norm": 2.162380052794788e-09,
      "kl": 0.0447998046875,
      "learning_rate": 1.0941083133185146e-05,
      "loss": 0.0018,
      "num_tokens": 107272953.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1570
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5236666666666666,
      "grad_norm": 1.596255350122533e-09,
      "kl": 0.046875,
      "learning_rate": 1.0929498609293925e-05,
      "loss": 0.0019,
      "num_tokens": 107347769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1571
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.524,
      "grad_norm": 1.5163403865869896e-09,
      "kl": 0.0450439453125,
      "learning_rate": 1.0917912826996319e-05,
      "loss": 0.0018,
      "num_tokens": 107423145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1572
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5243333333333333,
      "grad_norm": 1.702663454672404e-09,
      "kl": 0.0465087890625,
      "learning_rate": 1.0906325801977804e-05,
      "loss": 0.0019,
      "num_tokens": 107499449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1573
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5246666666666666,
      "grad_norm": 1.6652705880915164e-09,
      "kl": 0.04718017578125,
      "learning_rate": 1.0894737549925525e-05,
      "loss": 0.0019,
      "num_tokens": 107576729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1574
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.525,
      "grad_norm": 1.6206673780772007e-09,
      "kl": 0.044189453125,
      "learning_rate": 1.08831480865283e-05,
      "loss": 0.0018,
      "num_tokens": 107651801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1575
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5253333333333333,
      "grad_norm": 2.1230595059762436e-09,
      "kl": 0.04888916015625,
      "learning_rate": 1.0871557427476585e-05,
      "loss": 0.002,
      "num_tokens": 107728089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1576
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5256666666666666,
      "grad_norm": 3.244532420509927e-09,
      "kl": 0.04852294921875,
      "learning_rate": 1.0859965588462442e-05,
      "loss": 0.0019,
      "num_tokens": 107806073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1577
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.526,
      "grad_norm": 1.3243509622284932e-09,
      "kl": 0.04803466796875,
      "learning_rate": 1.0848372585179552e-05,
      "loss": 0.0019,
      "num_tokens": 107885241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1578
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5263333333333333,
      "grad_norm": 2.8321101019912476e-09,
      "kl": 0.04473876953125,
      "learning_rate": 1.083677843332316e-05,
      "loss": 0.0018,
      "num_tokens": 107961081.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1579
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5266666666666666,
      "grad_norm": 1.0748488765699449e-09,
      "kl": 0.04559326171875,
      "learning_rate": 1.0825183148590055e-05,
      "loss": 0.0018,
      "num_tokens": 108035849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1580
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.527,
      "grad_norm": 1.0800532690424802e-09,
      "kl": 0.0411376953125,
      "learning_rate": 1.0813586746678584e-05,
      "loss": 0.0016,
      "num_tokens": 108113433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1581
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5273333333333333,
      "grad_norm": 2.0653483367993886e-09,
      "kl": 0.04840087890625,
      "learning_rate": 1.0801989243288588e-05,
      "loss": 0.0019,
      "num_tokens": 108190537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1582
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5276666666666666,
      "grad_norm": 2.1389061632959283e-09,
      "kl": 0.04339599609375,
      "learning_rate": 1.0790390654121414e-05,
      "loss": 0.0017,
      "num_tokens": 108266617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1583
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.528,
      "grad_norm": 1.8402083190593999e-09,
      "kl": 0.0478515625,
      "learning_rate": 1.077879099487986e-05,
      "loss": 0.0019,
      "num_tokens": 108343481.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1584
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5283333333333333,
      "grad_norm": 2.042461533235951e-09,
      "kl": 0.0423583984375,
      "learning_rate": 1.0767190281268187e-05,
      "loss": 0.0017,
      "num_tokens": 108419705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1585
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5286666666666666,
      "grad_norm": 1.421043394067567e-09,
      "kl": 0.04241943359375,
      "learning_rate": 1.0755588528992082e-05,
      "loss": 0.0017,
      "num_tokens": 108498633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1586
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.529,
      "grad_norm": 2.2432247170911523e-09,
      "kl": 0.04119873046875,
      "learning_rate": 1.0743985753758636e-05,
      "loss": 0.0016,
      "num_tokens": 108579465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1587
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5293333333333333,
      "grad_norm": 1.591195064598594e-09,
      "kl": 0.04156494140625,
      "learning_rate": 1.0732381971276318e-05,
      "loss": 0.0017,
      "num_tokens": 108655993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1588
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5296666666666666,
      "grad_norm": 1.6709698069661272e-09,
      "kl": 0.04559326171875,
      "learning_rate": 1.0720777197254974e-05,
      "loss": 0.0018,
      "num_tokens": 108730041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1589
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.53,
      "grad_norm": 1.7509981242724848e-09,
      "kl": 0.04241943359375,
      "learning_rate": 1.0709171447405786e-05,
      "loss": 0.0017,
      "num_tokens": 108809193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1590
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5303333333333333,
      "grad_norm": 1.7802064267158357e-09,
      "kl": 0.04638671875,
      "learning_rate": 1.0697564737441254e-05,
      "loss": 0.0019,
      "num_tokens": 108885529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1591
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5306666666666666,
      "grad_norm": 1.1593932480735702e-09,
      "kl": 0.0460205078125,
      "learning_rate": 1.0685957083075182e-05,
      "loss": 0.0018,
      "num_tokens": 108958921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1592
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.531,
      "grad_norm": 1.6403380875829043e-09,
      "kl": 0.046142578125,
      "learning_rate": 1.0674348500022653e-05,
      "loss": 0.0018,
      "num_tokens": 109033721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1593
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5313333333333333,
      "grad_norm": 3.140320004035857e-09,
      "kl": 0.0452880859375,
      "learning_rate": 1.0662739004000005e-05,
      "loss": 0.0018,
      "num_tokens": 109109273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1594
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5316666666666666,
      "grad_norm": 2.002084942276383e-09,
      "kl": 0.0445556640625,
      "learning_rate": 1.0651128610724808e-05,
      "loss": 0.0018,
      "num_tokens": 109188265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1595
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.532,
      "grad_norm": 1.3694785305773394e-09,
      "kl": 0.0467529296875,
      "learning_rate": 1.0639517335915857e-05,
      "loss": 0.0019,
      "num_tokens": 109263497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1596
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5323333333333333,
      "grad_norm": 2.0904051822867586e-09,
      "kl": 0.04437255859375,
      "learning_rate": 1.0627905195293135e-05,
      "loss": 0.0018,
      "num_tokens": 109340073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1597
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5326666666666666,
      "grad_norm": 1.9008772333961588e-09,
      "kl": 0.04156494140625,
      "learning_rate": 1.0616292204577796e-05,
      "loss": 0.0017,
      "num_tokens": 109422873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1598
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.533,
      "grad_norm": 1.6173647976458483e-09,
      "kl": 0.0447998046875,
      "learning_rate": 1.0604678379492143e-05,
      "loss": 0.0018,
      "num_tokens": 109497785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1599
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5333333333333333,
      "grad_norm": 1.5989894963652773e-09,
      "kl": 0.0478515625,
      "learning_rate": 1.0593063735759619e-05,
      "loss": 0.0019,
      "num_tokens": 109571801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1600
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5336666666666666,
      "grad_norm": 2.0446220272418714e-09,
      "kl": 0.0411376953125,
      "learning_rate": 1.0581448289104759e-05,
      "loss": 0.0016,
      "num_tokens": 109648329.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1601
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.534,
      "grad_norm": 2.5829700600610295e-09,
      "kl": 0.042724609375,
      "learning_rate": 1.05698320552532e-05,
      "loss": 0.0017,
      "num_tokens": 109726841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1602
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5343333333333333,
      "grad_norm": 2.544499722034743e-09,
      "kl": 0.0421142578125,
      "learning_rate": 1.055821504993164e-05,
      "loss": 0.0017,
      "num_tokens": 109806873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1603
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5346666666666666,
      "grad_norm": 1.5378008866306914e-09,
      "kl": 0.0489501953125,
      "learning_rate": 1.0546597288867815e-05,
      "loss": 0.002,
      "num_tokens": 109881977.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1604
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.535,
      "grad_norm": 2.2183384018603647e-09,
      "kl": 0.04217529296875,
      "learning_rate": 1.0534978787790494e-05,
      "loss": 0.0017,
      "num_tokens": 109962073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1605
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5353333333333333,
      "grad_norm": 1.3582489577501633e-09,
      "kl": 0.04730224609375,
      "learning_rate": 1.0523359562429441e-05,
      "loss": 0.0019,
      "num_tokens": 110035881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1606
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5356666666666666,
      "grad_norm": 1.8569572546311974e-09,
      "kl": 0.0443115234375,
      "learning_rate": 1.0511739628515402e-05,
      "loss": 0.0018,
      "num_tokens": 110112265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1607
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.536,
      "grad_norm": 1.3517668095985869e-09,
      "kl": 0.04412841796875,
      "learning_rate": 1.0500119001780085e-05,
      "loss": 0.0018,
      "num_tokens": 110187321.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1608
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5363333333333333,
      "grad_norm": 1.730772858365981e-09,
      "kl": 0.0443115234375,
      "learning_rate": 1.0488497697956134e-05,
      "loss": 0.0018,
      "num_tokens": 110263097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1609
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5366666666666666,
      "grad_norm": 1.7241058580808044e-09,
      "kl": 0.0472412109375,
      "learning_rate": 1.047687573277711e-05,
      "loss": 0.0019,
      "num_tokens": 110339033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1610
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.537,
      "grad_norm": 1.2435815710531983e-09,
      "kl": 0.048583984375,
      "learning_rate": 1.046525312197747e-05,
      "loss": 0.0019,
      "num_tokens": 110413353.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1611
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5373333333333333,
      "grad_norm": 2.284982203448749e-09,
      "kl": 0.046142578125,
      "learning_rate": 1.0453629881292537e-05,
      "loss": 0.0018,
      "num_tokens": 110491769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1612
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5376666666666666,
      "grad_norm": 2.6038804445960295e-09,
      "kl": 0.0460205078125,
      "learning_rate": 1.0442006026458506e-05,
      "loss": 0.0018,
      "num_tokens": 110566393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1613
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.538,
      "grad_norm": 1.7037118382745575e-09,
      "kl": 0.04541015625,
      "learning_rate": 1.0430381573212385e-05,
      "loss": 0.0018,
      "num_tokens": 110642505.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1614
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5383333333333333,
      "grad_norm": 3.1982430037658105e-09,
      "kl": 0.04730224609375,
      "learning_rate": 1.0418756537291996e-05,
      "loss": 0.0019,
      "num_tokens": 110719049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1615
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5386666666666666,
      "grad_norm": 1.6670455016409846e-09,
      "kl": 0.0474853515625,
      "learning_rate": 1.040713093443596e-05,
      "loss": 0.0019,
      "num_tokens": 110793641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1616
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.539,
      "grad_norm": 1.5777880113532206e-09,
      "kl": 0.0450439453125,
      "learning_rate": 1.0395504780383653e-05,
      "loss": 0.0018,
      "num_tokens": 110868873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1617
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5393333333333333,
      "grad_norm": 1.6516649159470376e-09,
      "kl": 0.044677734375,
      "learning_rate": 1.03838780908752e-05,
      "loss": 0.0018,
      "num_tokens": 110941993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1618
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5396666666666666,
      "grad_norm": 1.9598369593865073e-09,
      "kl": 0.046630859375,
      "learning_rate": 1.037225088165146e-05,
      "loss": 0.0019,
      "num_tokens": 111018313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1619
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.54,
      "grad_norm": 1.3101654205982527e-09,
      "kl": 0.04534912109375,
      "learning_rate": 1.0360623168453982e-05,
      "loss": 0.0018,
      "num_tokens": 111094585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1620
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5403333333333333,
      "grad_norm": 1.16604326194647e-09,
      "kl": 0.04608154296875,
      "learning_rate": 1.0348994967025012e-05,
      "loss": 0.0018,
      "num_tokens": 111169129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1621
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5406666666666666,
      "grad_norm": 1.2935215121245847e-09,
      "kl": 0.046875,
      "learning_rate": 1.0337366293107441e-05,
      "loss": 0.0019,
      "num_tokens": 111244009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1622
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.541,
      "grad_norm": 1.346747824371164e-09,
      "kl": 0.0489501953125,
      "learning_rate": 1.0325737162444813e-05,
      "loss": 0.002,
      "num_tokens": 111319961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1623
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5413333333333333,
      "grad_norm": 1.6689208903741815e-09,
      "kl": 0.04791259765625,
      "learning_rate": 1.0314107590781284e-05,
      "loss": 0.0019,
      "num_tokens": 111396745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1624
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5416666666666666,
      "grad_norm": 1.3106982166277703e-09,
      "kl": 0.04718017578125,
      "learning_rate": 1.0302477593861608e-05,
      "loss": 0.0019,
      "num_tokens": 111470713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1625
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.542,
      "grad_norm": 1.0043521569969016e-09,
      "kl": 0.04290771484375,
      "learning_rate": 1.0290847187431115e-05,
      "loss": 0.0017,
      "num_tokens": 111543417.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1626
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5423333333333333,
      "grad_norm": 1.5507116701840573e-09,
      "kl": 0.04278564453125,
      "learning_rate": 1.0279216387235691e-05,
      "loss": 0.0017,
      "num_tokens": 111619385.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1627
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5426666666666666,
      "grad_norm": 1.8860577544188573e-09,
      "kl": 0.04736328125,
      "learning_rate": 1.0267585209021748e-05,
      "loss": 0.0019,
      "num_tokens": 111697401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1628
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.543,
      "grad_norm": 1.6815474568332434e-09,
      "kl": 0.04559326171875,
      "learning_rate": 1.0255953668536223e-05,
      "loss": 0.0018,
      "num_tokens": 111773385.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1629
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5433333333333333,
      "grad_norm": 2.135041476947208e-09,
      "kl": 0.0477294921875,
      "learning_rate": 1.0244321781526533e-05,
      "loss": 0.0019,
      "num_tokens": 111849833.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1630
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5436666666666666,
      "grad_norm": 1.212184019827589e-09,
      "kl": 0.045654296875,
      "learning_rate": 1.0232689563740563e-05,
      "loss": 0.0018,
      "num_tokens": 111924185.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1631
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.544,
      "grad_norm": 1.1712861791579599e-09,
      "kl": 0.046630859375,
      "learning_rate": 1.0221057030926657e-05,
      "loss": 0.0019,
      "num_tokens": 111999625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1632
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5443333333333333,
      "grad_norm": 1.0637560832194026e-09,
      "kl": 0.0472412109375,
      "learning_rate": 1.0209424198833571e-05,
      "loss": 0.0019,
      "num_tokens": 112073305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1633
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5446666666666666,
      "grad_norm": 1.7328191104226676e-09,
      "kl": 0.0465087890625,
      "learning_rate": 1.0197791083210478e-05,
      "loss": 0.0019,
      "num_tokens": 112150649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1634
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.545,
      "grad_norm": 1.9609580625967737e-09,
      "kl": 0.040557861328125,
      "learning_rate": 1.0186157699806928e-05,
      "loss": 0.0016,
      "num_tokens": 112226681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1635
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5453333333333333,
      "grad_norm": 1.1781242648112311e-09,
      "kl": 0.04388427734375,
      "learning_rate": 1.0174524064372837e-05,
      "loss": 0.0018,
      "num_tokens": 112300889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1636
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5456666666666666,
      "grad_norm": 1.7359098603009215e-09,
      "kl": 0.04547119140625,
      "learning_rate": 1.0162890192658459e-05,
      "loss": 0.0018,
      "num_tokens": 112376905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1637
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.546,
      "grad_norm": 3.3030778112674852e-09,
      "kl": 0.0445556640625,
      "learning_rate": 1.0151256100414375e-05,
      "loss": 0.0018,
      "num_tokens": 112454249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1638
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5463333333333333,
      "grad_norm": 2.270676535687244e-09,
      "kl": 0.043212890625,
      "learning_rate": 1.0139621803391454e-05,
      "loss": 0.0017,
      "num_tokens": 112529961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1639
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5466666666666666,
      "grad_norm": 2.0575796622068765e-09,
      "kl": 0.04351806640625,
      "learning_rate": 1.0127987317340851e-05,
      "loss": 0.0017,
      "num_tokens": 112608361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1640
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.547,
      "grad_norm": 1.5570634781525428e-09,
      "kl": 0.044189453125,
      "learning_rate": 1.0116352658013973e-05,
      "loss": 0.0018,
      "num_tokens": 112681593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1641
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5473333333333333,
      "grad_norm": 1.2765823953486688e-09,
      "kl": 0.0439453125,
      "learning_rate": 1.010471784116246e-05,
      "loss": 0.0018,
      "num_tokens": 112758265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1642
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5476666666666666,
      "grad_norm": 1.8057595418952133e-09,
      "kl": 0.047607421875,
      "learning_rate": 1.009308288253817e-05,
      "loss": 0.0019,
      "num_tokens": 112833545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1643
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.548,
      "grad_norm": 2.8282689523706495e-09,
      "kl": 0.0478515625,
      "learning_rate": 1.0081447797893149e-05,
      "loss": 0.0019,
      "num_tokens": 112909401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1644
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5483333333333333,
      "grad_norm": 2.4261828102112304e-09,
      "kl": 0.04705810546875,
      "learning_rate": 1.0069812602979617e-05,
      "loss": 0.0019,
      "num_tokens": 112987273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1645
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5486666666666666,
      "grad_norm": 1.696069062973038e-09,
      "kl": 0.04608154296875,
      "learning_rate": 1.005817731354994e-05,
      "loss": 0.0018,
      "num_tokens": 113061801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1646
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.549,
      "grad_norm": 2.684956257326121e-09,
      "kl": 0.0438232421875,
      "learning_rate": 1.0046541945356613e-05,
      "loss": 0.0018,
      "num_tokens": 113140345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1647
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5493333333333333,
      "grad_norm": 1.3716794367013563e-09,
      "kl": 0.04547119140625,
      "learning_rate": 1.0034906514152239e-05,
      "loss": 0.0018,
      "num_tokens": 113216153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1648
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5496666666666666,
      "grad_norm": 1.4945731319215838e-09,
      "kl": 0.0452880859375,
      "learning_rate": 1.0023271035689506e-05,
      "loss": 0.0018,
      "num_tokens": 113291289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1649
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.55,
      "grad_norm": 1.4617590471388553e-09,
      "kl": 0.04071044921875,
      "learning_rate": 1.001163552572116e-05,
      "loss": 0.0016,
      "num_tokens": 113367305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1650
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5503333333333333,
      "grad_norm": 2.175035263007885e-09,
      "kl": 0.042724609375,
      "learning_rate": 1e-05,
      "loss": 0.0017,
      "num_tokens": 113443529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1651
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5506666666666666,
      "grad_norm": 2.008985866552848e-09,
      "kl": 0.04693603515625,
      "learning_rate": 9.988364474278844e-06,
      "loss": 0.0019,
      "num_tokens": 113519529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1652
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.551,
      "grad_norm": 1.3901484408052056e-09,
      "kl": 0.04925537109375,
      "learning_rate": 9.976728964310499e-06,
      "loss": 0.002,
      "num_tokens": 113595225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1653
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5513333333333333,
      "grad_norm": 2.2641430952319297e-09,
      "kl": 0.0450439453125,
      "learning_rate": 9.965093485847766e-06,
      "loss": 0.0018,
      "num_tokens": 113670329.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1654
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5516666666666666,
      "grad_norm": 1.9120292016339135e-09,
      "kl": 0.0457763671875,
      "learning_rate": 9.953458054643389e-06,
      "loss": 0.0018,
      "num_tokens": 113747929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1655
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.552,
      "grad_norm": 1.233242730158679e-09,
      "kl": 0.0477294921875,
      "learning_rate": 9.941822686450061e-06,
      "loss": 0.0019,
      "num_tokens": 113820761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1656
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5523333333333333,
      "grad_norm": 3.649298863095396e-09,
      "kl": 0.048583984375,
      "learning_rate": 9.930187397020385e-06,
      "loss": 0.0019,
      "num_tokens": 113895401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1657
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5526666666666666,
      "grad_norm": 1.4731783570809398e-09,
      "kl": 0.04388427734375,
      "learning_rate": 9.918552202106853e-06,
      "loss": 0.0018,
      "num_tokens": 113969929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1658
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.553,
      "grad_norm": 1.9787802507664765e-09,
      "kl": 0.0460205078125,
      "learning_rate": 9.906917117461835e-06,
      "loss": 0.0018,
      "num_tokens": 114045129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1659
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5533333333333333,
      "grad_norm": 2.3373771806944887e-09,
      "kl": 0.0440673828125,
      "learning_rate": 9.895282158837545e-06,
      "loss": 0.0018,
      "num_tokens": 114121625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1660
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5536666666666666,
      "grad_norm": 3.3639102614557714e-09,
      "kl": 0.0445556640625,
      "learning_rate": 9.883647341986032e-06,
      "loss": 0.0018,
      "num_tokens": 114198777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1661
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.554,
      "grad_norm": 1.687276873774124e-09,
      "kl": 0.04205322265625,
      "learning_rate": 9.87201268265915e-06,
      "loss": 0.0017,
      "num_tokens": 114275129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1662
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5543333333333333,
      "grad_norm": 1.5919796592100965e-09,
      "kl": 0.04339599609375,
      "learning_rate": 9.860378196608549e-06,
      "loss": 0.0017,
      "num_tokens": 114351721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1663
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5546666666666666,
      "grad_norm": 1.410501271337239e-09,
      "kl": 0.04559326171875,
      "learning_rate": 9.848743899585628e-06,
      "loss": 0.0018,
      "num_tokens": 114424537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1664
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.555,
      "grad_norm": 2.9710793825188375e-09,
      "kl": 0.04156494140625,
      "learning_rate": 9.837109807341543e-06,
      "loss": 0.0017,
      "num_tokens": 114505321.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1665
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5553333333333333,
      "grad_norm": 1.39505929031003e-09,
      "kl": 0.049072265625,
      "learning_rate": 9.825475935627165e-06,
      "loss": 0.002,
      "num_tokens": 114581977.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1666
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5556666666666666,
      "grad_norm": 1.3482273075737794e-09,
      "kl": 0.04498291015625,
      "learning_rate": 9.813842300193077e-06,
      "loss": 0.0018,
      "num_tokens": 114656361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1667
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.556,
      "grad_norm": 1.4014948090945722e-09,
      "kl": 0.04058837890625,
      "learning_rate": 9.802208916789528e-06,
      "loss": 0.0016,
      "num_tokens": 114732121.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1668
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5563333333333333,
      "grad_norm": 1.881304223516622e-09,
      "kl": 0.04522705078125,
      "learning_rate": 9.790575801166432e-06,
      "loss": 0.0018,
      "num_tokens": 114806873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1669
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5566666666666666,
      "grad_norm": 1.7991323986166208e-09,
      "kl": 0.0462646484375,
      "learning_rate": 9.778942969073345e-06,
      "loss": 0.0019,
      "num_tokens": 114882825.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1670
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.557,
      "grad_norm": 2.1693273843936822e-09,
      "kl": 0.04815673828125,
      "learning_rate": 9.767310436259438e-06,
      "loss": 0.0019,
      "num_tokens": 114958089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1671
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5573333333333333,
      "grad_norm": 2.684920952233938e-09,
      "kl": 0.04547119140625,
      "learning_rate": 9.75567821847347e-06,
      "loss": 0.0018,
      "num_tokens": 115035321.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1672
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5576666666666666,
      "grad_norm": 1.7691628162452844e-09,
      "kl": 0.04571533203125,
      "learning_rate": 9.74404633146378e-06,
      "loss": 0.0018,
      "num_tokens": 115111257.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1673
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.558,
      "grad_norm": 1.706522811950606e-09,
      "kl": 0.04486083984375,
      "learning_rate": 9.732414790978253e-06,
      "loss": 0.0018,
      "num_tokens": 115186505.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1674
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5583333333333333,
      "grad_norm": 1.8742893903578306e-09,
      "kl": 0.04315185546875,
      "learning_rate": 9.720783612764314e-06,
      "loss": 0.0017,
      "num_tokens": 115264025.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1675
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5586666666666666,
      "grad_norm": 1.583735365073835e-09,
      "kl": 0.04473876953125,
      "learning_rate": 9.709152812568886e-06,
      "loss": 0.0018,
      "num_tokens": 115342953.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1676
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.559,
      "grad_norm": 2.468940385469409e-09,
      "kl": 0.04864501953125,
      "learning_rate": 9.697522406138395e-06,
      "loss": 0.0019,
      "num_tokens": 115418441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1677
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5593333333333333,
      "grad_norm": 2.229818107934989e-09,
      "kl": 0.0491943359375,
      "learning_rate": 9.685892409218718e-06,
      "loss": 0.002,
      "num_tokens": 115495177.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1678
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5596666666666666,
      "grad_norm": 1.401665672418062e-09,
      "kl": 0.04278564453125,
      "learning_rate": 9.67426283755519e-06,
      "loss": 0.0017,
      "num_tokens": 115569433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1679
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.56,
      "grad_norm": 2.023365475167793e-09,
      "kl": 0.0465087890625,
      "learning_rate": 9.66263370689256e-06,
      "loss": 0.0019,
      "num_tokens": 115646969.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1680
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5603333333333333,
      "grad_norm": 1.4352715682619532e-09,
      "kl": 0.0433349609375,
      "learning_rate": 9.651005032974994e-06,
      "loss": 0.0017,
      "num_tokens": 115722665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1681
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5606666666666666,
      "grad_norm": 1.5592030999656004e-09,
      "kl": 0.0430908203125,
      "learning_rate": 9.639376831546018e-06,
      "loss": 0.0017,
      "num_tokens": 115798297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1682
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.561,
      "grad_norm": 2.4742110582565147e-09,
      "kl": 0.04803466796875,
      "learning_rate": 9.627749118348541e-06,
      "loss": 0.0019,
      "num_tokens": 115876873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1683
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5613333333333334,
      "grad_norm": 1.3518681729607351e-09,
      "kl": 0.04620361328125,
      "learning_rate": 9.616121909124801e-06,
      "loss": 0.0018,
      "num_tokens": 115953561.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1684
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5616666666666666,
      "grad_norm": 1.7428610776804021e-09,
      "kl": 0.045654296875,
      "learning_rate": 9.60449521961635e-06,
      "loss": 0.0018,
      "num_tokens": 116029593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1685
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.562,
      "grad_norm": 1.9048589372516744e-09,
      "kl": 0.0474853515625,
      "learning_rate": 9.592869065564043e-06,
      "loss": 0.0019,
      "num_tokens": 116104729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1686
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5623333333333334,
      "grad_norm": 2.7315361084134793e-09,
      "kl": 0.04876708984375,
      "learning_rate": 9.581243462708007e-06,
      "loss": 0.0019,
      "num_tokens": 116180073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1687
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5626666666666666,
      "grad_norm": 1.786708447859553e-09,
      "kl": 0.04278564453125,
      "learning_rate": 9.56961842678762e-06,
      "loss": 0.0017,
      "num_tokens": 116256553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1688
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.563,
      "grad_norm": 2.1482200462941137e-09,
      "kl": 0.04547119140625,
      "learning_rate": 9.557993973541494e-06,
      "loss": 0.0018,
      "num_tokens": 116332073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1689
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5633333333333334,
      "grad_norm": 1.6395352853137979e-09,
      "kl": 0.04632568359375,
      "learning_rate": 9.546370118707463e-06,
      "loss": 0.0019,
      "num_tokens": 116407705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1690
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5636666666666666,
      "grad_norm": 1.256568626928356e-09,
      "kl": 0.04833984375,
      "learning_rate": 9.534746878022533e-06,
      "loss": 0.0019,
      "num_tokens": 116482873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1691
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.564,
      "grad_norm": 1.0780000225807385e-09,
      "kl": 0.047119140625,
      "learning_rate": 9.523124267222894e-06,
      "loss": 0.0019,
      "num_tokens": 116558297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1692
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5643333333333334,
      "grad_norm": 1.355680456782693e-09,
      "kl": 0.04229736328125,
      "learning_rate": 9.511502302043867e-06,
      "loss": 0.0017,
      "num_tokens": 116633849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1693
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5646666666666667,
      "grad_norm": 2.220461370328053e-09,
      "kl": 0.046142578125,
      "learning_rate": 9.49988099821992e-06,
      "loss": 0.0018,
      "num_tokens": 116710953.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1694
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.565,
      "grad_norm": 1.2572445307057478e-09,
      "kl": 0.04290771484375,
      "learning_rate": 9.488260371484603e-06,
      "loss": 0.0017,
      "num_tokens": 116786665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1695
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5653333333333334,
      "grad_norm": 1.6089199972313395e-09,
      "kl": 0.0404052734375,
      "learning_rate": 9.476640437570562e-06,
      "loss": 0.0016,
      "num_tokens": 116863657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1696
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5656666666666667,
      "grad_norm": 1.5192305191646938e-09,
      "kl": 0.04718017578125,
      "learning_rate": 9.465021212209508e-06,
      "loss": 0.0019,
      "num_tokens": 116941193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1697
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.566,
      "grad_norm": 1.9401311668332255e-09,
      "kl": 0.04638671875,
      "learning_rate": 9.453402711132188e-06,
      "loss": 0.0019,
      "num_tokens": 117017929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1698
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5663333333333334,
      "grad_norm": 1.5091872196393297e-09,
      "kl": 0.0450439453125,
      "learning_rate": 9.441784950068362e-06,
      "loss": 0.0018,
      "num_tokens": 117093241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1699
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5666666666666667,
      "grad_norm": 1.7722497913652546e-09,
      "kl": 0.04705810546875,
      "learning_rate": 9.430167944746802e-06,
      "loss": 0.0019,
      "num_tokens": 117168201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1700
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.567,
      "grad_norm": 1.4675030080013585e-09,
      "kl": 0.04595947265625,
      "learning_rate": 9.418551710895243e-06,
      "loss": 0.0018,
      "num_tokens": 117243913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1701
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5673333333333334,
      "grad_norm": 2.2495594276250586e-09,
      "kl": 0.0452880859375,
      "learning_rate": 9.406936264240386e-06,
      "loss": 0.0018,
      "num_tokens": 117323785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1702
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5676666666666667,
      "grad_norm": 1.7817670672215513e-09,
      "kl": 0.05072021484375,
      "learning_rate": 9.395321620507857e-06,
      "loss": 0.002,
      "num_tokens": 117398953.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1703
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.568,
      "grad_norm": 1.2746972366528553e-09,
      "kl": 0.0411376953125,
      "learning_rate": 9.383707795422207e-06,
      "loss": 0.0016,
      "num_tokens": 117473657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1704
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5683333333333334,
      "grad_norm": 2.6153930132721825e-09,
      "kl": 0.0469970703125,
      "learning_rate": 9.372094804706867e-06,
      "loss": 0.0019,
      "num_tokens": 117553193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1705
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5686666666666667,
      "grad_norm": 1.7244937700056084e-09,
      "kl": 0.045654296875,
      "learning_rate": 9.360482664084144e-06,
      "loss": 0.0018,
      "num_tokens": 117626825.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1706
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.569,
      "grad_norm": 1.7249683903486357e-09,
      "kl": 0.04754638671875,
      "learning_rate": 9.348871389275194e-06,
      "loss": 0.0019,
      "num_tokens": 117701561.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1707
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5693333333333334,
      "grad_norm": 1.752768485907552e-09,
      "kl": 0.043701171875,
      "learning_rate": 9.337260996000002e-06,
      "loss": 0.0017,
      "num_tokens": 117778969.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1708
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5696666666666667,
      "grad_norm": 1.8492066766739867e-09,
      "kl": 0.0479736328125,
      "learning_rate": 9.32565149997735e-06,
      "loss": 0.0019,
      "num_tokens": 117855001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1709
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.57,
      "grad_norm": 9.798014621864581e-10,
      "kl": 0.041748046875,
      "learning_rate": 9.314042916924816e-06,
      "loss": 0.0017,
      "num_tokens": 117928521.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1710
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5703333333333334,
      "grad_norm": 1.470273680581613e-09,
      "kl": 0.04510498046875,
      "learning_rate": 9.302435262558748e-06,
      "loss": 0.0018,
      "num_tokens": 118010297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1711
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5706666666666667,
      "grad_norm": 7.561631321095774e-10,
      "kl": 0.0401611328125,
      "learning_rate": 9.290828552594218e-06,
      "loss": 0.0016,
      "num_tokens": 118085657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1712
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.571,
      "grad_norm": 1.917776604187793e-09,
      "kl": 0.04620361328125,
      "learning_rate": 9.279222802745028e-06,
      "loss": 0.0018,
      "num_tokens": 118162361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1713
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5713333333333334,
      "grad_norm": 1.3652730057600593e-09,
      "kl": 0.0450439453125,
      "learning_rate": 9.267618028723687e-06,
      "loss": 0.0018,
      "num_tokens": 118236521.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1714
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5716666666666667,
      "grad_norm": 2.026567580415417e-09,
      "kl": 0.0458984375,
      "learning_rate": 9.256014246241369e-06,
      "loss": 0.0018,
      "num_tokens": 118313593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1715
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.572,
      "grad_norm": 2.4780502094756685e-09,
      "kl": 0.04376220703125,
      "learning_rate": 9.244411471007923e-06,
      "loss": 0.0017,
      "num_tokens": 118390665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1716
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5723333333333334,
      "grad_norm": 2.0237731490624356e-09,
      "kl": 0.045166015625,
      "learning_rate": 9.232809718731815e-06,
      "loss": 0.0018,
      "num_tokens": 118467769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1717
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5726666666666667,
      "grad_norm": 3.4158820216845243e-09,
      "kl": 0.044677734375,
      "learning_rate": 9.221209005120142e-06,
      "loss": 0.0018,
      "num_tokens": 118544777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1718
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.573,
      "grad_norm": 1.8960197856188188e-09,
      "kl": 0.04254150390625,
      "learning_rate": 9.20960934587859e-06,
      "loss": 0.0017,
      "num_tokens": 118618729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1719
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5733333333333334,
      "grad_norm": 2.2596837734312203e-09,
      "kl": 0.0474853515625,
      "learning_rate": 9.198010756711413e-06,
      "loss": 0.0019,
      "num_tokens": 118694729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1720
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5736666666666667,
      "grad_norm": 1.983386788140251e-09,
      "kl": 0.04742431640625,
      "learning_rate": 9.18641325332142e-06,
      "loss": 0.0019,
      "num_tokens": 118770505.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1721
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.574,
      "grad_norm": 2.21807083811143e-09,
      "kl": 0.04571533203125,
      "learning_rate": 9.174816851409949e-06,
      "loss": 0.0018,
      "num_tokens": 118847865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1722
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5743333333333334,
      "grad_norm": 1.524444015466031e-09,
      "kl": 0.04681396484375,
      "learning_rate": 9.163221566676847e-06,
      "loss": 0.0019,
      "num_tokens": 118922073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1723
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5746666666666667,
      "grad_norm": 2.259560760720092e-09,
      "kl": 0.0443115234375,
      "learning_rate": 9.151627414820448e-06,
      "loss": 0.0018,
      "num_tokens": 118999673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1724
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.575,
      "grad_norm": 1.4119426738901097e-09,
      "kl": 0.04541015625,
      "learning_rate": 9.140034411537558e-06,
      "loss": 0.0018,
      "num_tokens": 119074793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1725
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5753333333333334,
      "grad_norm": 1.0345949652545983e-09,
      "kl": 0.04437255859375,
      "learning_rate": 9.128442572523418e-06,
      "loss": 0.0018,
      "num_tokens": 119149401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1726
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5756666666666667,
      "grad_norm": 2.1159862750863567e-09,
      "kl": 0.04351806640625,
      "learning_rate": 9.116851913471701e-06,
      "loss": 0.0017,
      "num_tokens": 119226537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1727
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.576,
      "grad_norm": 1.1043195247140147e-09,
      "kl": 0.04815673828125,
      "learning_rate": 9.105262450074479e-06,
      "loss": 0.0019,
      "num_tokens": 119300297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1728
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5763333333333334,
      "grad_norm": 3.860205044503573e-09,
      "kl": 0.04339599609375,
      "learning_rate": 9.093674198022201e-06,
      "loss": 0.0017,
      "num_tokens": 119378281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1729
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5766666666666667,
      "grad_norm": 1.656429327034914e-09,
      "kl": 0.04425048828125,
      "learning_rate": 9.082087173003686e-06,
      "loss": 0.0018,
      "num_tokens": 119457481.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1730
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.577,
      "grad_norm": 1.4837566730818708e-09,
      "kl": 0.04608154296875,
      "learning_rate": 9.07050139070608e-06,
      "loss": 0.0018,
      "num_tokens": 119533849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1731
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5773333333333334,
      "grad_norm": 2.0708159631510625e-09,
      "kl": 0.04998779296875,
      "learning_rate": 9.058916866814857e-06,
      "loss": 0.002,
      "num_tokens": 119611977.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1732
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5776666666666667,
      "grad_norm": 1.5584373791455164e-09,
      "kl": 0.0472412109375,
      "learning_rate": 9.047333617013786e-06,
      "loss": 0.0019,
      "num_tokens": 119687721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1733
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.578,
      "grad_norm": 1.3733874038024396e-09,
      "kl": 0.0438232421875,
      "learning_rate": 9.035751656984904e-06,
      "loss": 0.0018,
      "num_tokens": 119762457.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1734
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5783333333333334,
      "grad_norm": 1.005605265724796e-09,
      "kl": 0.0450439453125,
      "learning_rate": 9.024171002408507e-06,
      "loss": 0.0018,
      "num_tokens": 119836329.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1735
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5786666666666667,
      "grad_norm": 1.3734121617758888e-09,
      "kl": 0.044921875,
      "learning_rate": 9.012591668963123e-06,
      "loss": 0.0018,
      "num_tokens": 119911961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1736
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.579,
      "grad_norm": 1.5128538422004567e-09,
      "kl": 0.0447998046875,
      "learning_rate": 9.001013672325491e-06,
      "loss": 0.0018,
      "num_tokens": 119987497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1737
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5793333333333334,
      "grad_norm": 1.5829820787516269e-09,
      "kl": 0.04693603515625,
      "learning_rate": 8.989437028170537e-06,
      "loss": 0.0019,
      "num_tokens": 120061929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1738
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5796666666666667,
      "grad_norm": 1.4337884213233565e-09,
      "kl": 0.044677734375,
      "learning_rate": 8.977861752171365e-06,
      "loss": 0.0018,
      "num_tokens": 120136617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1739
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.58,
      "grad_norm": 1.5615622128706264e-09,
      "kl": 0.046630859375,
      "learning_rate": 8.966287859999216e-06,
      "loss": 0.0019,
      "num_tokens": 120212265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1740
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5803333333333334,
      "grad_norm": 2.503499185735336e-09,
      "kl": 0.046630859375,
      "learning_rate": 8.954715367323468e-06,
      "loss": 0.0019,
      "num_tokens": 120291945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1741
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5806666666666667,
      "grad_norm": 1.8234164178565493e-09,
      "kl": 0.04229736328125,
      "learning_rate": 8.94314428981159e-06,
      "loss": 0.0017,
      "num_tokens": 120367161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1742
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.581,
      "grad_norm": 1.9290238295610607e-09,
      "kl": 0.0447998046875,
      "learning_rate": 8.931574643129152e-06,
      "loss": 0.0018,
      "num_tokens": 120443449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1743
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5813333333333334,
      "grad_norm": 2.931271003703273e-09,
      "kl": 0.04437255859375,
      "learning_rate": 8.920006442939772e-06,
      "loss": 0.0018,
      "num_tokens": 120520713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1744
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5816666666666667,
      "grad_norm": 2.500268436733677e-09,
      "kl": 0.047119140625,
      "learning_rate": 8.90843970490512e-06,
      "loss": 0.0019,
      "num_tokens": 120596841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1745
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.582,
      "grad_norm": 1.2232546087176388e-09,
      "kl": 0.04705810546875,
      "learning_rate": 8.896874444684882e-06,
      "loss": 0.0019,
      "num_tokens": 120671241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1746
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5823333333333334,
      "grad_norm": 2.2328690008066587e-09,
      "kl": 0.04608154296875,
      "learning_rate": 8.885310677936746e-06,
      "loss": 0.0018,
      "num_tokens": 120747785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1747
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5826666666666667,
      "grad_norm": 1.1094360985453022e-09,
      "kl": 0.0472412109375,
      "learning_rate": 8.873748420316372e-06,
      "loss": 0.0019,
      "num_tokens": 120822409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1748
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.583,
      "grad_norm": 1.3191042702587197e-09,
      "kl": 0.045654296875,
      "learning_rate": 8.862187687477386e-06,
      "loss": 0.0018,
      "num_tokens": 120896297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1749
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5833333333333334,
      "grad_norm": 2.8086502013024983e-09,
      "kl": 0.046875,
      "learning_rate": 8.850628495071336e-06,
      "loss": 0.0019,
      "num_tokens": 120972905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1750
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5836666666666667,
      "grad_norm": 1.377590375106763e-09,
      "kl": 0.0452880859375,
      "learning_rate": 8.839070858747697e-06,
      "loss": 0.0018,
      "num_tokens": 121047689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1751
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.584,
      "grad_norm": 2.4442166068894267e-09,
      "kl": 0.04412841796875,
      "learning_rate": 8.827514794153839e-06,
      "loss": 0.0018,
      "num_tokens": 121124201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1752
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5843333333333334,
      "grad_norm": 2.2935537913326698e-09,
      "kl": 0.0465087890625,
      "learning_rate": 8.815960316934991e-06,
      "loss": 0.0019,
      "num_tokens": 121200569.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1753
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5846666666666667,
      "grad_norm": 1.6534378310950615e-09,
      "kl": 0.0440673828125,
      "learning_rate": 8.804407442734244e-06,
      "loss": 0.0018,
      "num_tokens": 121275033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1754
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.585,
      "grad_norm": 1.8148305080956106e-09,
      "kl": 0.0491943359375,
      "learning_rate": 8.792856187192516e-06,
      "loss": 0.002,
      "num_tokens": 121351513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1755
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5853333333333334,
      "grad_norm": 1.8299756154860347e-09,
      "kl": 0.04730224609375,
      "learning_rate": 8.781306565948528e-06,
      "loss": 0.0019,
      "num_tokens": 121431913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1756
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5856666666666667,
      "grad_norm": 1.563575824370389e-09,
      "kl": 0.04473876953125,
      "learning_rate": 8.769758594638796e-06,
      "loss": 0.0018,
      "num_tokens": 121507049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1757
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.586,
      "grad_norm": 1.4938239534245668e-09,
      "kl": 0.0465087890625,
      "learning_rate": 8.758212288897597e-06,
      "loss": 0.0019,
      "num_tokens": 121581257.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1758
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5863333333333334,
      "grad_norm": 1.427981954904567e-09,
      "kl": 0.04595947265625,
      "learning_rate": 8.746667664356957e-06,
      "loss": 0.0018,
      "num_tokens": 121655081.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1759
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5866666666666667,
      "grad_norm": 1.3076499882913595e-09,
      "kl": 0.04791259765625,
      "learning_rate": 8.735124736646627e-06,
      "loss": 0.0019,
      "num_tokens": 121729449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1760
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.587,
      "grad_norm": 2.683783417722907e-09,
      "kl": 0.04833984375,
      "learning_rate": 8.723583521394054e-06,
      "loss": 0.0019,
      "num_tokens": 121806617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1761
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5873333333333334,
      "grad_norm": 1.2446469410676286e-09,
      "kl": 0.04498291015625,
      "learning_rate": 8.712044034224374e-06,
      "loss": 0.0018,
      "num_tokens": 121885305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1762
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5876666666666667,
      "grad_norm": 2.503556251198802e-09,
      "kl": 0.043701171875,
      "learning_rate": 8.700506290760377e-06,
      "loss": 0.0017,
      "num_tokens": 121966905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1763
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.588,
      "grad_norm": 2.325371895040007e-09,
      "kl": 0.04705810546875,
      "learning_rate": 8.688970306622494e-06,
      "loss": 0.0019,
      "num_tokens": 122043385.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1764
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5883333333333334,
      "grad_norm": 1.0238431213949184e-09,
      "kl": 0.04150390625,
      "learning_rate": 8.677436097428775e-06,
      "loss": 0.0017,
      "num_tokens": 122118601.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1765
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5886666666666667,
      "grad_norm": 2.0231400998937943e-09,
      "kl": 0.04132080078125,
      "learning_rate": 8.665903678794873e-06,
      "loss": 0.0017,
      "num_tokens": 122194249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1766
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.589,
      "grad_norm": 1.476762934160547e-09,
      "kl": 0.0467529296875,
      "learning_rate": 8.654373066334007e-06,
      "loss": 0.0019,
      "num_tokens": 122269257.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1767
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5893333333333334,
      "grad_norm": 1.092842816241557e-09,
      "kl": 0.04498291015625,
      "learning_rate": 8.642844275656957e-06,
      "loss": 0.0018,
      "num_tokens": 122343177.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1768
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5896666666666667,
      "grad_norm": 1.4508390044909447e-09,
      "kl": 0.04534912109375,
      "learning_rate": 8.631317322372032e-06,
      "loss": 0.0018,
      "num_tokens": 122418233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1769
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.59,
      "grad_norm": 2.323455650099504e-09,
      "kl": 0.04827880859375,
      "learning_rate": 8.619792222085059e-06,
      "loss": 0.0019,
      "num_tokens": 122497529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1770
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5903333333333334,
      "grad_norm": 3.2626954471481895e-09,
      "kl": 0.04376220703125,
      "learning_rate": 8.60826899039935e-06,
      "loss": 0.0018,
      "num_tokens": 122575161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1771
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5906666666666667,
      "grad_norm": 1.5555665644484407e-09,
      "kl": 0.04730224609375,
      "learning_rate": 8.596747642915687e-06,
      "loss": 0.0019,
      "num_tokens": 122649737.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1772
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.591,
      "grad_norm": 3.3488369854950406e-09,
      "kl": 0.0482177734375,
      "learning_rate": 8.585228195232311e-06,
      "loss": 0.0019,
      "num_tokens": 122730745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1773
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5913333333333334,
      "grad_norm": 1.7979397970435684e-09,
      "kl": 0.04443359375,
      "learning_rate": 8.573710662944884e-06,
      "loss": 0.0018,
      "num_tokens": 122806905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1774
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5916666666666667,
      "grad_norm": 1.7654516737408699e-09,
      "kl": 0.04681396484375,
      "learning_rate": 8.562195061646474e-06,
      "loss": 0.0019,
      "num_tokens": 122881913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1775
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.592,
      "grad_norm": 1.456313403203069e-09,
      "kl": 0.04364013671875,
      "learning_rate": 8.550681406927534e-06,
      "loss": 0.0017,
      "num_tokens": 122960329.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1776
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5923333333333334,
      "grad_norm": 2.5032791395318554e-09,
      "kl": 0.04376220703125,
      "learning_rate": 8.539169714375885e-06,
      "loss": 0.0018,
      "num_tokens": 123038377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1777
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5926666666666667,
      "grad_norm": 1.246847958213948e-09,
      "kl": 0.04296875,
      "learning_rate": 8.527659999576692e-06,
      "loss": 0.0017,
      "num_tokens": 123111401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1778
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.593,
      "grad_norm": 1.3495038420074934e-09,
      "kl": 0.04522705078125,
      "learning_rate": 8.516152278112433e-06,
      "loss": 0.0018,
      "num_tokens": 123184489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1779
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5933333333333334,
      "grad_norm": 1.646224490059467e-09,
      "kl": 0.0428466796875,
      "learning_rate": 8.504646565562907e-06,
      "loss": 0.0017,
      "num_tokens": 123259769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1780
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5936666666666667,
      "grad_norm": 1.183572906349184e-09,
      "kl": 0.04351806640625,
      "learning_rate": 8.49314287750517e-06,
      "loss": 0.0017,
      "num_tokens": 123333769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1781
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.594,
      "grad_norm": 1.576472952180552e-09,
      "kl": 0.0496826171875,
      "learning_rate": 8.481641229513554e-06,
      "loss": 0.002,
      "num_tokens": 123409081.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1782
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5943333333333334,
      "grad_norm": 1.1496690266454834e-09,
      "kl": 0.045166015625,
      "learning_rate": 8.47014163715962e-06,
      "loss": 0.0018,
      "num_tokens": 123484185.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1783
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5946666666666667,
      "grad_norm": 1.5892974714049046e-09,
      "kl": 0.04510498046875,
      "learning_rate": 8.458644116012154e-06,
      "loss": 0.0018,
      "num_tokens": 123560585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1784
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.595,
      "grad_norm": 9.45226785731279e-10,
      "kl": 0.0469970703125,
      "learning_rate": 8.447148681637127e-06,
      "loss": 0.0019,
      "num_tokens": 123638345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1785
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5953333333333334,
      "grad_norm": 1.885247069566276e-09,
      "kl": 0.04534912109375,
      "learning_rate": 8.43565534959769e-06,
      "loss": 0.0018,
      "num_tokens": 123717977.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1786
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5956666666666667,
      "grad_norm": 3.654695213128889e-09,
      "kl": 0.04449462890625,
      "learning_rate": 8.424164135454158e-06,
      "loss": 0.0018,
      "num_tokens": 123796473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1787
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.596,
      "grad_norm": 1.504958047071625e-09,
      "kl": 0.04742431640625,
      "learning_rate": 8.412675054763963e-06,
      "loss": 0.0019,
      "num_tokens": 123872249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1788
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5963333333333334,
      "grad_norm": 1.5198285963080593e-09,
      "kl": 0.0443115234375,
      "learning_rate": 8.401188123081653e-06,
      "loss": 0.0018,
      "num_tokens": 123948313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1789
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5966666666666667,
      "grad_norm": 1.829009499410006e-09,
      "kl": 0.04327392578125,
      "learning_rate": 8.389703355958873e-06,
      "loss": 0.0017,
      "num_tokens": 124025497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1790
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.597,
      "grad_norm": 1.350954570433771e-09,
      "kl": 0.04266357421875,
      "learning_rate": 8.378220768944328e-06,
      "loss": 0.0017,
      "num_tokens": 124101001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1791
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5973333333333334,
      "grad_norm": 1.2985260644526875e-09,
      "kl": 0.0472412109375,
      "learning_rate": 8.366740377583781e-06,
      "loss": 0.0019,
      "num_tokens": 124176025.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1792
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5976666666666667,
      "grad_norm": 1.8538298673931308e-09,
      "kl": 0.04705810546875,
      "learning_rate": 8.355262197420011e-06,
      "loss": 0.0019,
      "num_tokens": 124252649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1793
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.598,
      "grad_norm": 1.965329010644723e-09,
      "kl": 0.044921875,
      "learning_rate": 8.343786243992819e-06,
      "loss": 0.0018,
      "num_tokens": 124330009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1794
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5983333333333334,
      "grad_norm": 1.3647816210493602e-09,
      "kl": 0.047607421875,
      "learning_rate": 8.332312532838978e-06,
      "loss": 0.0019,
      "num_tokens": 124404713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1795
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5986666666666667,
      "grad_norm": 1.900713586522329e-09,
      "kl": 0.04595947265625,
      "learning_rate": 8.32084107949223e-06,
      "loss": 0.0018,
      "num_tokens": 124481417.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1796
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.599,
      "grad_norm": 1.3123166997530689e-09,
      "kl": 0.04302978515625,
      "learning_rate": 8.309371899483261e-06,
      "loss": 0.0017,
      "num_tokens": 124559721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1797
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5993333333333334,
      "grad_norm": 1.8150818625883858e-09,
      "kl": 0.04449462890625,
      "learning_rate": 8.297905008339677e-06,
      "loss": 0.0018,
      "num_tokens": 124639881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1798
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.5996666666666667,
      "grad_norm": 2.8555537934238373e-09,
      "kl": 0.04437255859375,
      "learning_rate": 8.286440421585986e-06,
      "loss": 0.0018,
      "num_tokens": 124717641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1799
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6,
      "grad_norm": 1.5675584164043244e-09,
      "kl": 0.04486083984375,
      "learning_rate": 8.274978154743574e-06,
      "loss": 0.0018,
      "num_tokens": 124793545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1800
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6003333333333334,
      "grad_norm": 1.7557897358244645e-09,
      "kl": 0.0474853515625,
      "learning_rate": 8.263518223330698e-06,
      "loss": 0.0019,
      "num_tokens": 124866665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1801
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6006666666666667,
      "grad_norm": 1.0279787021616471e-09,
      "kl": 0.04608154296875,
      "learning_rate": 8.252060642862436e-06,
      "loss": 0.0018,
      "num_tokens": 124941545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1802
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.601,
      "grad_norm": 1.3686799471557265e-09,
      "kl": 0.04156494140625,
      "learning_rate": 8.240605428850693e-06,
      "loss": 0.0017,
      "num_tokens": 125015417.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1803
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6013333333333334,
      "grad_norm": 2.2302211188929277e-09,
      "kl": 0.04486083984375,
      "learning_rate": 8.22915259680417e-06,
      "loss": 0.0018,
      "num_tokens": 125093321.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1804
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6016666666666667,
      "grad_norm": 1.1721226211847124e-09,
      "kl": 0.0477294921875,
      "learning_rate": 8.217702162228337e-06,
      "loss": 0.0019,
      "num_tokens": 125167913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1805
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.602,
      "grad_norm": 1.4239555090611589e-09,
      "kl": 0.044921875,
      "learning_rate": 8.206254140625425e-06,
      "loss": 0.0018,
      "num_tokens": 125241913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1806
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6023333333333334,
      "grad_norm": 1.7920175343633105e-09,
      "kl": 0.04705810546875,
      "learning_rate": 8.194808547494401e-06,
      "loss": 0.0019,
      "num_tokens": 125319049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1807
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6026666666666667,
      "grad_norm": 1.508231206592825e-09,
      "kl": 0.04571533203125,
      "learning_rate": 8.183365398330931e-06,
      "loss": 0.0018,
      "num_tokens": 125394249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1808
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.603,
      "grad_norm": 1.424103390768039e-09,
      "kl": 0.04217529296875,
      "learning_rate": 8.171924708627387e-06,
      "loss": 0.0017,
      "num_tokens": 125468841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1809
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6033333333333334,
      "grad_norm": 1.5848075074487156e-09,
      "kl": 0.0433349609375,
      "learning_rate": 8.1604864938728e-06,
      "loss": 0.0017,
      "num_tokens": 125545161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1810
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6036666666666667,
      "grad_norm": 1.7607113544926278e-09,
      "kl": 0.04730224609375,
      "learning_rate": 8.149050769552856e-06,
      "loss": 0.0019,
      "num_tokens": 125622601.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1811
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.604,
      "grad_norm": 1.8087880082617858e-09,
      "kl": 0.04443359375,
      "learning_rate": 8.137617551149868e-06,
      "loss": 0.0018,
      "num_tokens": 125697353.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1812
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6043333333333333,
      "grad_norm": 1.837575092089594e-09,
      "kl": 0.0455322265625,
      "learning_rate": 8.126186854142752e-06,
      "loss": 0.0018,
      "num_tokens": 125772217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1813
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6046666666666667,
      "grad_norm": 1.5220485982680998e-09,
      "kl": 0.04681396484375,
      "learning_rate": 8.114758694007025e-06,
      "loss": 0.0019,
      "num_tokens": 125846761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1814
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.605,
      "grad_norm": 1.5049265167377257e-09,
      "kl": 0.04681396484375,
      "learning_rate": 8.103333086214753e-06,
      "loss": 0.0019,
      "num_tokens": 125921257.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1815
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6053333333333333,
      "grad_norm": 2.5190234342886697e-09,
      "kl": 0.040771484375,
      "learning_rate": 8.091910046234552e-06,
      "loss": 0.0016,
      "num_tokens": 125997433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1816
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6056666666666667,
      "grad_norm": 2.1337900335538507e-09,
      "kl": 0.04290771484375,
      "learning_rate": 8.080489589531567e-06,
      "loss": 0.0017,
      "num_tokens": 126072617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1817
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.606,
      "grad_norm": 1.868444732266994e-09,
      "kl": 0.0439453125,
      "learning_rate": 8.069071731567435e-06,
      "loss": 0.0018,
      "num_tokens": 126148425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1818
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6063333333333333,
      "grad_norm": 1.7837000765297262e-09,
      "kl": 0.04864501953125,
      "learning_rate": 8.057656487800283e-06,
      "loss": 0.0019,
      "num_tokens": 126224105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1819
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6066666666666667,
      "grad_norm": 3.08671643800551e-09,
      "kl": 0.04498291015625,
      "learning_rate": 8.046243873684694e-06,
      "loss": 0.0018,
      "num_tokens": 126303673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1820
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.607,
      "grad_norm": 1.7429819809677838e-09,
      "kl": 0.0474853515625,
      "learning_rate": 8.034833904671698e-06,
      "loss": 0.0019,
      "num_tokens": 126379961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1821
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6073333333333333,
      "grad_norm": 1.6735277608148635e-09,
      "kl": 0.04669189453125,
      "learning_rate": 8.023426596208739e-06,
      "loss": 0.0019,
      "num_tokens": 126455833.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1822
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6076666666666667,
      "grad_norm": 1.5251878648925299e-09,
      "kl": 0.040283203125,
      "learning_rate": 8.012021963739659e-06,
      "loss": 0.0016,
      "num_tokens": 126531689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1823
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.608,
      "grad_norm": 1.3055631020719716e-09,
      "kl": 0.044189453125,
      "learning_rate": 8.00062002270467e-06,
      "loss": 0.0018,
      "num_tokens": 126606649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1824
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6083333333333333,
      "grad_norm": 1.7147958608632052e-09,
      "kl": 0.0455322265625,
      "learning_rate": 7.989220788540356e-06,
      "loss": 0.0018,
      "num_tokens": 126682169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1825
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6086666666666667,
      "grad_norm": 3.0053066701896114e-09,
      "kl": 0.04681396484375,
      "learning_rate": 7.977824276679623e-06,
      "loss": 0.0019,
      "num_tokens": 126759289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1826
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.609,
      "grad_norm": 2.357055217672155e-09,
      "kl": 0.0479736328125,
      "learning_rate": 7.966430502551694e-06,
      "loss": 0.0019,
      "num_tokens": 126834169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1827
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6093333333333333,
      "grad_norm": 1.5345462678340027e-09,
      "kl": 0.04608154296875,
      "learning_rate": 7.955039481582098e-06,
      "loss": 0.0018,
      "num_tokens": 126907161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1828
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6096666666666667,
      "grad_norm": 2.952116107124425e-09,
      "kl": 0.04644775390625,
      "learning_rate": 7.943651229192615e-06,
      "loss": 0.0019,
      "num_tokens": 126981897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1829
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.61,
      "grad_norm": 1.2255934045413142e-09,
      "kl": 0.04754638671875,
      "learning_rate": 7.932265760801295e-06,
      "loss": 0.0019,
      "num_tokens": 127057625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1830
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6103333333333333,
      "grad_norm": 2.1801340732707786e-09,
      "kl": 0.0469970703125,
      "learning_rate": 7.92088309182241e-06,
      "loss": 0.0019,
      "num_tokens": 127132857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1831
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6106666666666667,
      "grad_norm": 1.4415745264173552e-09,
      "kl": 0.0418701171875,
      "learning_rate": 7.90950323766644e-06,
      "loss": 0.0017,
      "num_tokens": 127210681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1832
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.611,
      "grad_norm": 1.4148440197203627e-09,
      "kl": 0.0462646484375,
      "learning_rate": 7.898126213740063e-06,
      "loss": 0.0018,
      "num_tokens": 127285913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1833
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6113333333333333,
      "grad_norm": 1.5988915746945054e-09,
      "kl": 0.0452880859375,
      "learning_rate": 7.886752035446116e-06,
      "loss": 0.0018,
      "num_tokens": 127361609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1834
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6116666666666667,
      "grad_norm": 1.6123699042580597e-09,
      "kl": 0.04473876953125,
      "learning_rate": 7.875380718183589e-06,
      "loss": 0.0018,
      "num_tokens": 127436057.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1835
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.612,
      "grad_norm": 1.6639797317807847e-09,
      "kl": 0.0445556640625,
      "learning_rate": 7.864012277347602e-06,
      "loss": 0.0018,
      "num_tokens": 127511193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1836
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6123333333333333,
      "grad_norm": 9.107178344791578e-10,
      "kl": 0.04400634765625,
      "learning_rate": 7.852646728329368e-06,
      "loss": 0.0018,
      "num_tokens": 127586009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1837
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6126666666666667,
      "grad_norm": 1.712260999653381e-09,
      "kl": 0.04571533203125,
      "learning_rate": 7.841284086516201e-06,
      "loss": 0.0018,
      "num_tokens": 127661241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1838
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.613,
      "grad_norm": 2.2920261244507856e-09,
      "kl": 0.04791259765625,
      "learning_rate": 7.829924367291467e-06,
      "loss": 0.0019,
      "num_tokens": 127739785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1839
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6133333333333333,
      "grad_norm": 2.4824513555898875e-09,
      "kl": 0.04705810546875,
      "learning_rate": 7.818567586034578e-06,
      "loss": 0.0019,
      "num_tokens": 127816409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1840
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6136666666666667,
      "grad_norm": 1.1884033757070256e-09,
      "kl": 0.046142578125,
      "learning_rate": 7.807213758120965e-06,
      "loss": 0.0018,
      "num_tokens": 127891449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1841
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.614,
      "grad_norm": 1.6770269617438771e-09,
      "kl": 0.04638671875,
      "learning_rate": 7.79586289892208e-06,
      "loss": 0.0019,
      "num_tokens": 127966569.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1842
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6143333333333333,
      "grad_norm": 2.0313823956286114e-09,
      "kl": 0.04803466796875,
      "learning_rate": 7.784515023805328e-06,
      "loss": 0.0019,
      "num_tokens": 128042633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1843
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6146666666666667,
      "grad_norm": 1.2903884627490925e-09,
      "kl": 0.04656982421875,
      "learning_rate": 7.773170148134092e-06,
      "loss": 0.0019,
      "num_tokens": 128121481.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1844
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.615,
      "grad_norm": 1.517370007420027e-09,
      "kl": 0.04510498046875,
      "learning_rate": 7.761828287267688e-06,
      "loss": 0.0018,
      "num_tokens": 128197145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1845
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6153333333333333,
      "grad_norm": 9.97361304655442e-10,
      "kl": 0.03985595703125,
      "learning_rate": 7.750489456561351e-06,
      "loss": 0.0016,
      "num_tokens": 128272041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1846
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6156666666666667,
      "grad_norm": 1.296184604093753e-09,
      "kl": 0.04620361328125,
      "learning_rate": 7.739153671366219e-06,
      "loss": 0.0018,
      "num_tokens": 128347721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1847
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.616,
      "grad_norm": 1.082241185557109e-09,
      "kl": 0.04681396484375,
      "learning_rate": 7.727820947029289e-06,
      "loss": 0.0019,
      "num_tokens": 128421561.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1848
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6163333333333333,
      "grad_norm": 2.2342969696609316e-09,
      "kl": 0.0474853515625,
      "learning_rate": 7.716491298893443e-06,
      "loss": 0.0019,
      "num_tokens": 128497817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1849
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6166666666666667,
      "grad_norm": 3.3846692115702126e-09,
      "kl": 0.046630859375,
      "learning_rate": 7.705164742297376e-06,
      "loss": 0.0019,
      "num_tokens": 128573609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1850
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.617,
      "grad_norm": 2.08920103439425e-09,
      "kl": 0.04498291015625,
      "learning_rate": 7.6938412925756e-06,
      "loss": 0.0018,
      "num_tokens": 128649433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1851
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6173333333333333,
      "grad_norm": 1.6775133504509654e-09,
      "kl": 0.0477294921875,
      "learning_rate": 7.68252096505843e-06,
      "loss": 0.0019,
      "num_tokens": 128724537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1852
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6176666666666667,
      "grad_norm": 8.972145804087006e-10,
      "kl": 0.0430908203125,
      "learning_rate": 7.671203775071942e-06,
      "loss": 0.0017,
      "num_tokens": 128798953.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1853
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.618,
      "grad_norm": 1.4025647310234035e-09,
      "kl": 0.04742431640625,
      "learning_rate": 7.65988973793798e-06,
      "loss": 0.0019,
      "num_tokens": 128873017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1854
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6183333333333333,
      "grad_norm": 2.208160987393626e-09,
      "kl": 0.04425048828125,
      "learning_rate": 7.6485788689741e-06,
      "loss": 0.0018,
      "num_tokens": 128949385.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1855
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6186666666666667,
      "grad_norm": 1.7150909581431506e-09,
      "kl": 0.04547119140625,
      "learning_rate": 7.637271183493587e-06,
      "loss": 0.0018,
      "num_tokens": 129022089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1856
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.619,
      "grad_norm": 2.241152818882597e-09,
      "kl": 0.047119140625,
      "learning_rate": 7.625966696805406e-06,
      "loss": 0.0019,
      "num_tokens": 129098265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1857
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6193333333333333,
      "grad_norm": 1.4086324329198874e-09,
      "kl": 0.0438232421875,
      "learning_rate": 7.6146654242141935e-06,
      "loss": 0.0018,
      "num_tokens": 129172265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1858
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6196666666666667,
      "grad_norm": 1.644347658036338e-09,
      "kl": 0.0435791015625,
      "learning_rate": 7.6033673810202314e-06,
      "loss": 0.0017,
      "num_tokens": 129249273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1859
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.62,
      "grad_norm": 1.4028938011279024e-09,
      "kl": 0.04364013671875,
      "learning_rate": 7.592072582519437e-06,
      "loss": 0.0017,
      "num_tokens": 129323673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1860
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6203333333333333,
      "grad_norm": 1.7844444810677373e-09,
      "kl": 0.0482177734375,
      "learning_rate": 7.580781044003324e-06,
      "loss": 0.0019,
      "num_tokens": 129400185.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1861
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6206666666666667,
      "grad_norm": 1.3033287782349134e-09,
      "kl": 0.04364013671875,
      "learning_rate": 7.569492780759002e-06,
      "loss": 0.0017,
      "num_tokens": 129474953.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1862
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.621,
      "grad_norm": 1.7599040003091204e-09,
      "kl": 0.04864501953125,
      "learning_rate": 7.558207808069149e-06,
      "loss": 0.0019,
      "num_tokens": 129556601.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1863
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6213333333333333,
      "grad_norm": 2.2806734278901786e-09,
      "kl": 0.04412841796875,
      "learning_rate": 7.546926141211975e-06,
      "loss": 0.0018,
      "num_tokens": 129639817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1864
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6216666666666667,
      "grad_norm": 1.3473536730757019e-09,
      "kl": 0.04290771484375,
      "learning_rate": 7.535647795461224e-06,
      "loss": 0.0017,
      "num_tokens": 129714633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1865
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.622,
      "grad_norm": 2.456862047139907e-09,
      "kl": 0.04571533203125,
      "learning_rate": 7.524372786086143e-06,
      "loss": 0.0018,
      "num_tokens": 129791273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1866
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6223333333333333,
      "grad_norm": 1.1364569285632342e-09,
      "kl": 0.04510498046875,
      "learning_rate": 7.513101128351454e-06,
      "loss": 0.0018,
      "num_tokens": 129865561.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1867
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6226666666666667,
      "grad_norm": 2.0032744352249665e-09,
      "kl": 0.04473876953125,
      "learning_rate": 7.501832837517351e-06,
      "loss": 0.0018,
      "num_tokens": 129941689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1868
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.623,
      "grad_norm": 1.5712428025338454e-09,
      "kl": 0.04443359375,
      "learning_rate": 7.490567928839472e-06,
      "loss": 0.0018,
      "num_tokens": 130016105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1869
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6233333333333333,
      "grad_norm": 3.05246272702675e-09,
      "kl": 0.04888916015625,
      "learning_rate": 7.4793064175688635e-06,
      "loss": 0.002,
      "num_tokens": 130094201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1870
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6236666666666667,
      "grad_norm": 2.1661195059863303e-09,
      "kl": 0.04693603515625,
      "learning_rate": 7.468048318951983e-06,
      "loss": 0.0019,
      "num_tokens": 130168665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1871
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.624,
      "grad_norm": 4.371242923184582e-09,
      "kl": 0.0457763671875,
      "learning_rate": 7.4567936482306625e-06,
      "loss": 0.0018,
      "num_tokens": 130247961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1872
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6243333333333333,
      "grad_norm": 1.803898919128244e-09,
      "kl": 0.04083251953125,
      "learning_rate": 7.445542420642097e-06,
      "loss": 0.0016,
      "num_tokens": 130324073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1873
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6246666666666667,
      "grad_norm": 1.482397871122032e-09,
      "kl": 0.04571533203125,
      "learning_rate": 7.434294651418815e-06,
      "loss": 0.0018,
      "num_tokens": 130400249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1874
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.625,
      "grad_norm": 1.830876561470518e-09,
      "kl": 0.0418701171875,
      "learning_rate": 7.423050355788663e-06,
      "loss": 0.0017,
      "num_tokens": 130472985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1875
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6253333333333333,
      "grad_norm": 2.9684679159203142e-09,
      "kl": 0.04840087890625,
      "learning_rate": 7.411809548974792e-06,
      "loss": 0.0019,
      "num_tokens": 130549097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1876
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6256666666666667,
      "grad_norm": 1.456159304247251e-09,
      "kl": 0.04827880859375,
      "learning_rate": 7.400572246195628e-06,
      "loss": 0.0019,
      "num_tokens": 130622953.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1877
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.626,
      "grad_norm": 1.2934106008444246e-09,
      "kl": 0.04400634765625,
      "learning_rate": 7.389338462664841e-06,
      "loss": 0.0018,
      "num_tokens": 130699241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1878
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6263333333333333,
      "grad_norm": 2.85818146927852e-09,
      "kl": 0.04840087890625,
      "learning_rate": 7.378108213591355e-06,
      "loss": 0.0019,
      "num_tokens": 130776937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1879
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6266666666666667,
      "grad_norm": 2.1645660819302748e-09,
      "kl": 0.0469970703125,
      "learning_rate": 7.366881514179292e-06,
      "loss": 0.0019,
      "num_tokens": 130854041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1880
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.627,
      "grad_norm": 1.1394856169744116e-09,
      "kl": 0.04351806640625,
      "learning_rate": 7.355658379627981e-06,
      "loss": 0.0017,
      "num_tokens": 130930137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1881
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6273333333333333,
      "grad_norm": 1.5089685057034785e-09,
      "kl": 0.04669189453125,
      "learning_rate": 7.344438825131912e-06,
      "loss": 0.0019,
      "num_tokens": 131004217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1882
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6276666666666667,
      "grad_norm": 2.163821566369961e-09,
      "kl": 0.045166015625,
      "learning_rate": 7.333222865880745e-06,
      "loss": 0.0018,
      "num_tokens": 131079913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1883
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.628,
      "grad_norm": 1.5465370095668618e-09,
      "kl": 0.044677734375,
      "learning_rate": 7.322010517059256e-06,
      "loss": 0.0018,
      "num_tokens": 131159513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1884
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6283333333333333,
      "grad_norm": 1.2963966566914564e-09,
      "kl": 0.04302978515625,
      "learning_rate": 7.310801793847344e-06,
      "loss": 0.0017,
      "num_tokens": 131234809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1885
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6286666666666667,
      "grad_norm": 1.6543231229348976e-09,
      "kl": 0.04510498046875,
      "learning_rate": 7.299596711419994e-06,
      "loss": 0.0018,
      "num_tokens": 131310921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1886
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.629,
      "grad_norm": 1.2287947326328208e-09,
      "kl": 0.044677734375,
      "learning_rate": 7.288395284947263e-06,
      "loss": 0.0018,
      "num_tokens": 131384217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1887
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6293333333333333,
      "grad_norm": 1.821832795734224e-09,
      "kl": 0.04644775390625,
      "learning_rate": 7.277197529594257e-06,
      "loss": 0.0019,
      "num_tokens": 131461817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1888
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6296666666666667,
      "grad_norm": 2.2385659992352203e-09,
      "kl": 0.04937744140625,
      "learning_rate": 7.266003460521116e-06,
      "loss": 0.002,
      "num_tokens": 131537369.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1889
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.63,
      "grad_norm": 1.1771449370812093e-09,
      "kl": 0.05133056640625,
      "learning_rate": 7.254813092882989e-06,
      "loss": 0.0021,
      "num_tokens": 131611497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1890
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6303333333333333,
      "grad_norm": 2.183838221370138e-09,
      "kl": 0.0452880859375,
      "learning_rate": 7.243626441830009e-06,
      "loss": 0.0018,
      "num_tokens": 131687081.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1891
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6306666666666667,
      "grad_norm": 2.6615354364878385e-09,
      "kl": 0.0458984375,
      "learning_rate": 7.23244352250728e-06,
      "loss": 0.0018,
      "num_tokens": 131765225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1892
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.631,
      "grad_norm": 2.396752352140652e-09,
      "kl": 0.0482177734375,
      "learning_rate": 7.221264350054855e-06,
      "loss": 0.0019,
      "num_tokens": 131843225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1893
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6313333333333333,
      "grad_norm": 1.9081400903786516e-09,
      "kl": 0.04754638671875,
      "learning_rate": 7.210088939607709e-06,
      "loss": 0.0019,
      "num_tokens": 131919609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1894
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6316666666666667,
      "grad_norm": 2.6077777715016737e-09,
      "kl": 0.04376220703125,
      "learning_rate": 7.1989173062957345e-06,
      "loss": 0.0017,
      "num_tokens": 131998281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1895
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.632,
      "grad_norm": 1.4723079422296337e-09,
      "kl": 0.0445556640625,
      "learning_rate": 7.187749465243694e-06,
      "loss": 0.0018,
      "num_tokens": 132073673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1896
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6323333333333333,
      "grad_norm": 1.6434945626642161e-09,
      "kl": 0.04718017578125,
      "learning_rate": 7.176585431571235e-06,
      "loss": 0.0019,
      "num_tokens": 132148937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1897
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6326666666666667,
      "grad_norm": 1.477611477618268e-09,
      "kl": 0.0438232421875,
      "learning_rate": 7.165425220392839e-06,
      "loss": 0.0018,
      "num_tokens": 132223673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1898
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.633,
      "grad_norm": 1.4106188439555467e-09,
      "kl": 0.04620361328125,
      "learning_rate": 7.154268846817812e-06,
      "loss": 0.0018,
      "num_tokens": 132296713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1899
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6333333333333333,
      "grad_norm": 1.6255169432710659e-09,
      "kl": 0.0469970703125,
      "learning_rate": 7.143116325950266e-06,
      "loss": 0.0019,
      "num_tokens": 132372889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1900
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6336666666666667,
      "grad_norm": 3.496915867984285e-09,
      "kl": 0.0445556640625,
      "learning_rate": 7.131967672889101e-06,
      "loss": 0.0018,
      "num_tokens": 132448089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1901
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.634,
      "grad_norm": 2.3770960755342685e-09,
      "kl": 0.04522705078125,
      "learning_rate": 7.120822902727972e-06,
      "loss": 0.0018,
      "num_tokens": 132526057.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1902
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6343333333333333,
      "grad_norm": 1.4412132598451421e-09,
      "kl": 0.04779052734375,
      "learning_rate": 7.109682030555283e-06,
      "loss": 0.0019,
      "num_tokens": 132600793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1903
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6346666666666667,
      "grad_norm": 1.6879673214731383e-09,
      "kl": 0.04522705078125,
      "learning_rate": 7.0985450714541685e-06,
      "loss": 0.0018,
      "num_tokens": 132674953.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1904
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.635,
      "grad_norm": 1.508832281338357e-09,
      "kl": 0.0445556640625,
      "learning_rate": 7.087412040502446e-06,
      "loss": 0.0018,
      "num_tokens": 132751001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1905
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6353333333333333,
      "grad_norm": 1.192776766245629e-09,
      "kl": 0.04534912109375,
      "learning_rate": 7.076282952772634e-06,
      "loss": 0.0018,
      "num_tokens": 132825817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1906
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6356666666666667,
      "grad_norm": 1.5989748414213523e-09,
      "kl": 0.04620361328125,
      "learning_rate": 7.0651578233318986e-06,
      "loss": 0.0019,
      "num_tokens": 132901945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1907
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.636,
      "grad_norm": 1.14927933836384e-09,
      "kl": 0.049560546875,
      "learning_rate": 7.054036667242055e-06,
      "loss": 0.002,
      "num_tokens": 132977033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1908
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6363333333333333,
      "grad_norm": 1.4603932507739614e-09,
      "kl": 0.050048828125,
      "learning_rate": 7.042919499559538e-06,
      "loss": 0.002,
      "num_tokens": 133052217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1909
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6366666666666667,
      "grad_norm": 1.643143843210737e-09,
      "kl": 0.0472412109375,
      "learning_rate": 7.031806335335372e-06,
      "loss": 0.0019,
      "num_tokens": 133128409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1910
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.637,
      "grad_norm": 1.7449706124494924e-09,
      "kl": 0.04669189453125,
      "learning_rate": 7.02069718961518e-06,
      "loss": 0.0019,
      "num_tokens": 133203273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1911
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6373333333333333,
      "grad_norm": 1.849943642717733e-09,
      "kl": 0.0428466796875,
      "learning_rate": 7.009592077439135e-06,
      "loss": 0.0017,
      "num_tokens": 133280057.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1912
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6376666666666667,
      "grad_norm": 1.491166523592824e-09,
      "kl": 0.04681396484375,
      "learning_rate": 6.9984910138419434e-06,
      "loss": 0.0019,
      "num_tokens": 133354969.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1913
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.638,
      "grad_norm": 1.8732115858455245e-09,
      "kl": 0.04644775390625,
      "learning_rate": 6.987394013852843e-06,
      "loss": 0.0019,
      "num_tokens": 133431449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1914
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6383333333333333,
      "grad_norm": 1.2184496744893636e-09,
      "kl": 0.044189453125,
      "learning_rate": 6.976301092495556e-06,
      "loss": 0.0018,
      "num_tokens": 133506713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1915
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6386666666666667,
      "grad_norm": 1.3719752001151164e-09,
      "kl": 0.04656982421875,
      "learning_rate": 6.9652122647882966e-06,
      "loss": 0.0019,
      "num_tokens": 133585721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1916
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.639,
      "grad_norm": 2.07307415678315e-09,
      "kl": 0.044677734375,
      "learning_rate": 6.9541275457437215e-06,
      "loss": 0.0018,
      "num_tokens": 133662985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1917
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6393333333333333,
      "grad_norm": 1.8676311608345486e-09,
      "kl": 0.04876708984375,
      "learning_rate": 6.943046950368944e-06,
      "loss": 0.0019,
      "num_tokens": 133740041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1918
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6396666666666667,
      "grad_norm": 1.234923496795659e-09,
      "kl": 0.04364013671875,
      "learning_rate": 6.931970493665478e-06,
      "loss": 0.0017,
      "num_tokens": 133813305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1919
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.64,
      "grad_norm": 9.19981757441235e-10,
      "kl": 0.04339599609375,
      "learning_rate": 6.920898190629242e-06,
      "loss": 0.0017,
      "num_tokens": 133891305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1920
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6403333333333333,
      "grad_norm": 2.7641804400957426e-09,
      "kl": 0.046630859375,
      "learning_rate": 6.909830056250527e-06,
      "loss": 0.0019,
      "num_tokens": 133969673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1921
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6406666666666667,
      "grad_norm": 1.6468314489870295e-09,
      "kl": 0.04010009765625,
      "learning_rate": 6.8987661055139865e-06,
      "loss": 0.0016,
      "num_tokens": 134046041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1922
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.641,
      "grad_norm": 2.581733493656202e-09,
      "kl": 0.0498046875,
      "learning_rate": 6.8877063533986025e-06,
      "loss": 0.002,
      "num_tokens": 134125065.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1923
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6413333333333333,
      "grad_norm": 1.9296342301799996e-09,
      "kl": 0.045654296875,
      "learning_rate": 6.876650814877675e-06,
      "loss": 0.0018,
      "num_tokens": 134205257.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1924
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6416666666666667,
      "grad_norm": 2.031419032988424e-09,
      "kl": 0.046142578125,
      "learning_rate": 6.865599504918805e-06,
      "loss": 0.0018,
      "num_tokens": 134281545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1925
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.642,
      "grad_norm": 2.3758641720661444e-09,
      "kl": 0.0433349609375,
      "learning_rate": 6.854552438483866e-06,
      "loss": 0.0017,
      "num_tokens": 134363017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1926
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6423333333333333,
      "grad_norm": 1.1521709142314762e-09,
      "kl": 0.04876708984375,
      "learning_rate": 6.843509630528977e-06,
      "loss": 0.002,
      "num_tokens": 134438025.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1927
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6426666666666667,
      "grad_norm": 2.387333442044337e-09,
      "kl": 0.04522705078125,
      "learning_rate": 6.832471096004505e-06,
      "loss": 0.0018,
      "num_tokens": 134514441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1928
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.643,
      "grad_norm": 2.3476260935240134e-09,
      "kl": 0.04644775390625,
      "learning_rate": 6.821436849855023e-06,
      "loss": 0.0019,
      "num_tokens": 134589913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1929
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6433333333333333,
      "grad_norm": 1.4283827454164566e-09,
      "kl": 0.041748046875,
      "learning_rate": 6.8104069070193e-06,
      "loss": 0.0017,
      "num_tokens": 134664409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1930
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6436666666666667,
      "grad_norm": 1.7663965845571283e-09,
      "kl": 0.04443359375,
      "learning_rate": 6.799381282430284e-06,
      "loss": 0.0018,
      "num_tokens": 134739753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1931
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.644,
      "grad_norm": 1.4734766740076566e-09,
      "kl": 0.04278564453125,
      "learning_rate": 6.78835999101507e-06,
      "loss": 0.0017,
      "num_tokens": 134815129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1932
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6443333333333333,
      "grad_norm": 2.112878982885036e-09,
      "kl": 0.04937744140625,
      "learning_rate": 6.777343047694891e-06,
      "loss": 0.002,
      "num_tokens": 134892665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1933
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6446666666666667,
      "grad_norm": 9.772184172973652e-10,
      "kl": 0.04364013671875,
      "learning_rate": 6.766330467385088e-06,
      "loss": 0.0017,
      "num_tokens": 134967289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1934
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.645,
      "grad_norm": 1.7630332749263289e-09,
      "kl": 0.04925537109375,
      "learning_rate": 6.755322264995099e-06,
      "loss": 0.002,
      "num_tokens": 135041529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1935
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6453333333333333,
      "grad_norm": 1.7300717525259302e-09,
      "kl": 0.0479736328125,
      "learning_rate": 6.744318455428436e-06,
      "loss": 0.0019,
      "num_tokens": 135116697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1936
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6456666666666667,
      "grad_norm": 1.5565276845208587e-09,
      "kl": 0.04833984375,
      "learning_rate": 6.733319053582659e-06,
      "loss": 0.0019,
      "num_tokens": 135192057.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1937
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.646,
      "grad_norm": 2.372598117972302e-09,
      "kl": 0.05157470703125,
      "learning_rate": 6.722324074349367e-06,
      "loss": 0.0021,
      "num_tokens": 135268441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1938
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6463333333333333,
      "grad_norm": 1.917065617362823e-09,
      "kl": 0.047607421875,
      "learning_rate": 6.711333532614168e-06,
      "loss": 0.0019,
      "num_tokens": 135343945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1939
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6466666666666666,
      "grad_norm": 1.2906954394154013e-09,
      "kl": 0.04449462890625,
      "learning_rate": 6.700347443256661e-06,
      "loss": 0.0018,
      "num_tokens": 135419961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1940
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.647,
      "grad_norm": 2.011222743902863e-09,
      "kl": 0.04376220703125,
      "learning_rate": 6.689365821150421e-06,
      "loss": 0.0018,
      "num_tokens": 135496089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1941
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6473333333333333,
      "grad_norm": 1.7532705287592876e-09,
      "kl": 0.0498046875,
      "learning_rate": 6.67838868116297e-06,
      "loss": 0.002,
      "num_tokens": 135569785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1942
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6476666666666666,
      "grad_norm": 1.4899961264802641e-09,
      "kl": 0.045166015625,
      "learning_rate": 6.667416038155763e-06,
      "loss": 0.0018,
      "num_tokens": 135643817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1943
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.648,
      "grad_norm": 1.268116722741297e-09,
      "kl": 0.04132080078125,
      "learning_rate": 6.656447906984168e-06,
      "loss": 0.0017,
      "num_tokens": 135717033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1944
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6483333333333333,
      "grad_norm": 9.861612637607209e-10,
      "kl": 0.04486083984375,
      "learning_rate": 6.645484302497452e-06,
      "loss": 0.0018,
      "num_tokens": 135791497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1945
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6486666666666666,
      "grad_norm": 1.6581522821468297e-09,
      "kl": 0.04864501953125,
      "learning_rate": 6.634525239538736e-06,
      "loss": 0.0019,
      "num_tokens": 135867577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1946
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.649,
      "grad_norm": 1.5769037187141066e-09,
      "kl": 0.04638671875,
      "learning_rate": 6.623570732945012e-06,
      "loss": 0.0019,
      "num_tokens": 135941721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1947
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6493333333333333,
      "grad_norm": 1.4770659140239673e-09,
      "kl": 0.04473876953125,
      "learning_rate": 6.612620797547087e-06,
      "loss": 0.0018,
      "num_tokens": 136016985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1948
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6496666666666666,
      "grad_norm": 1.8653953937075585e-09,
      "kl": 0.043701171875,
      "learning_rate": 6.601675448169591e-06,
      "loss": 0.0017,
      "num_tokens": 136092713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1949
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.65,
      "grad_norm": 1.2497890500284825e-09,
      "kl": 0.0428466796875,
      "learning_rate": 6.590734699630939e-06,
      "loss": 0.0017,
      "num_tokens": 136166905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1950
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6503333333333333,
      "grad_norm": 1.3450764946298932e-09,
      "kl": 0.045654296875,
      "learning_rate": 6.579798566743314e-06,
      "loss": 0.0018,
      "num_tokens": 136241785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1951
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6506666666666666,
      "grad_norm": 1.7639859573037597e-09,
      "kl": 0.0438232421875,
      "learning_rate": 6.568867064312661e-06,
      "loss": 0.0018,
      "num_tokens": 136320137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1952
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.651,
      "grad_norm": 1.5360064331559897e-09,
      "kl": 0.0450439453125,
      "learning_rate": 6.5579402071386485e-06,
      "loss": 0.0018,
      "num_tokens": 136394697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1953
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6513333333333333,
      "grad_norm": 2.8081055258866172e-09,
      "kl": 0.04486083984375,
      "learning_rate": 6.547018010014654e-06,
      "loss": 0.0018,
      "num_tokens": 136474649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1954
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6516666666666666,
      "grad_norm": 1.4487635535687104e-09,
      "kl": 0.045166015625,
      "learning_rate": 6.536100487727754e-06,
      "loss": 0.0018,
      "num_tokens": 136548441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1955
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.652,
      "grad_norm": 1.3795813380568234e-09,
      "kl": 0.0482177734375,
      "learning_rate": 6.525187655058687e-06,
      "loss": 0.0019,
      "num_tokens": 136623465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1956
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6523333333333333,
      "grad_norm": 1.2294382178978935e-09,
      "kl": 0.0435791015625,
      "learning_rate": 6.5142795267818505e-06,
      "loss": 0.0017,
      "num_tokens": 136696457.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1957
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6526666666666666,
      "grad_norm": 2.855998992856712e-09,
      "kl": 0.04486083984375,
      "learning_rate": 6.503376117665262e-06,
      "loss": 0.0018,
      "num_tokens": 136777929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1958
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.653,
      "grad_norm": 1.5704763045576442e-09,
      "kl": 0.044921875,
      "learning_rate": 6.492477442470566e-06,
      "loss": 0.0018,
      "num_tokens": 136854201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1959
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6533333333333333,
      "grad_norm": 1.36952138518609e-09,
      "kl": 0.04510498046875,
      "learning_rate": 6.481583515952983e-06,
      "loss": 0.0018,
      "num_tokens": 136927865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1960
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6536666666666666,
      "grad_norm": 1.4903973610813637e-09,
      "kl": 0.04425048828125,
      "learning_rate": 6.4706943528613135e-06,
      "loss": 0.0018,
      "num_tokens": 137002473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1961
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.654,
      "grad_norm": 1.578972952387403e-09,
      "kl": 0.04345703125,
      "learning_rate": 6.4598099679379024e-06,
      "loss": 0.0017,
      "num_tokens": 137076761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1962
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6543333333333333,
      "grad_norm": 2.420224687327277e-09,
      "kl": 0.043212890625,
      "learning_rate": 6.448930375918632e-06,
      "loss": 0.0017,
      "num_tokens": 137153625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1963
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6546666666666666,
      "grad_norm": 1.6870497221432856e-09,
      "kl": 0.0423583984375,
      "learning_rate": 6.43805559153289e-06,
      "loss": 0.0017,
      "num_tokens": 137233321.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1964
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.655,
      "grad_norm": 2.0265491507132083e-09,
      "kl": 0.04534912109375,
      "learning_rate": 6.427185629503561e-06,
      "loss": 0.0018,
      "num_tokens": 137310649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1965
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6553333333333333,
      "grad_norm": 1.087271717103988e-09,
      "kl": 0.04351806640625,
      "learning_rate": 6.4163205045469975e-06,
      "loss": 0.0017,
      "num_tokens": 137384681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1966
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6556666666666666,
      "grad_norm": 1.3223340200596567e-09,
      "kl": 0.04638671875,
      "learning_rate": 6.405460231373003e-06,
      "loss": 0.0019,
      "num_tokens": 137459017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1967
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.656,
      "grad_norm": 1.7804193674919588e-09,
      "kl": 0.04559326171875,
      "learning_rate": 6.394604824684815e-06,
      "loss": 0.0018,
      "num_tokens": 137534169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1968
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6563333333333333,
      "grad_norm": 1.5786283391605593e-09,
      "kl": 0.04437255859375,
      "learning_rate": 6.383754299179079e-06,
      "loss": 0.0018,
      "num_tokens": 137612953.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1969
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6566666666666666,
      "grad_norm": 1.060493803883844e-09,
      "kl": 0.0469970703125,
      "learning_rate": 6.372908669545832e-06,
      "loss": 0.0019,
      "num_tokens": 137687145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1970
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.657,
      "grad_norm": 1.2245715552694492e-09,
      "kl": 0.0482177734375,
      "learning_rate": 6.362067950468489e-06,
      "loss": 0.0019,
      "num_tokens": 137761769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1971
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6573333333333333,
      "grad_norm": 1.6057961627069517e-09,
      "kl": 0.04473876953125,
      "learning_rate": 6.351232156623803e-06,
      "loss": 0.0018,
      "num_tokens": 137835689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1972
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6576666666666666,
      "grad_norm": 1.0489554780335197e-09,
      "kl": 0.04534912109375,
      "learning_rate": 6.340401302681879e-06,
      "loss": 0.0018,
      "num_tokens": 137911513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1973
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.658,
      "grad_norm": 1.6948649150805295e-09,
      "kl": 0.04632568359375,
      "learning_rate": 6.3295754033061196e-06,
      "loss": 0.0019,
      "num_tokens": 137987129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1974
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6583333333333333,
      "grad_norm": 2.8825064557480573e-09,
      "kl": 0.04730224609375,
      "learning_rate": 6.318754473153221e-06,
      "loss": 0.0019,
      "num_tokens": 138063849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1975
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6586666666666666,
      "grad_norm": 1.1305566482988638e-09,
      "kl": 0.0413818359375,
      "learning_rate": 6.3079385268731575e-06,
      "loss": 0.0017,
      "num_tokens": 138142169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1976
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.659,
      "grad_norm": 1.2178956732000756e-09,
      "kl": 0.0472412109375,
      "learning_rate": 6.29712757910915e-06,
      "loss": 0.0019,
      "num_tokens": 138216697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1977
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6593333333333333,
      "grad_norm": 2.704009460785528e-09,
      "kl": 0.04693603515625,
      "learning_rate": 6.286321644497655e-06,
      "loss": 0.0019,
      "num_tokens": 138293513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1978
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6596666666666666,
      "grad_norm": 1.8339284535429101e-09,
      "kl": 0.04638671875,
      "learning_rate": 6.275520737668338e-06,
      "loss": 0.0019,
      "num_tokens": 138370265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1979
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.66,
      "grad_norm": 2.5290105565289878e-09,
      "kl": 0.0484619140625,
      "learning_rate": 6.26472487324407e-06,
      "loss": 0.0019,
      "num_tokens": 138446601.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1980
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6603333333333333,
      "grad_norm": 1.7671978325140003e-09,
      "kl": 0.04559326171875,
      "learning_rate": 6.25393406584088e-06,
      "loss": 0.0018,
      "num_tokens": 138523417.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1981
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6606666666666666,
      "grad_norm": 2.218245143126296e-09,
      "kl": 0.04693603515625,
      "learning_rate": 6.243148330067961e-06,
      "loss": 0.0019,
      "num_tokens": 138598041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1982
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.661,
      "grad_norm": 1.7759036463615985e-09,
      "kl": 0.05120849609375,
      "learning_rate": 6.2323676805276315e-06,
      "loss": 0.0021,
      "num_tokens": 138672905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1983
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6613333333333333,
      "grad_norm": 1.692885165383018e-09,
      "kl": 0.04510498046875,
      "learning_rate": 6.22159213181533e-06,
      "loss": 0.0018,
      "num_tokens": 138748617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1984
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6616666666666666,
      "grad_norm": 1.8566247428353222e-09,
      "kl": 0.0465087890625,
      "learning_rate": 6.210821698519592e-06,
      "loss": 0.0019,
      "num_tokens": 138823833.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1985
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.662,
      "grad_norm": 2.2181931846887437e-09,
      "kl": 0.04278564453125,
      "learning_rate": 6.200056395222012e-06,
      "loss": 0.0017,
      "num_tokens": 138901513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1986
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6623333333333333,
      "grad_norm": 2.2822197465188765e-09,
      "kl": 0.04473876953125,
      "learning_rate": 6.18929623649726e-06,
      "loss": 0.0018,
      "num_tokens": 138976569.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1987
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6626666666666666,
      "grad_norm": 1.4739386378082031e-09,
      "kl": 0.04766845703125,
      "learning_rate": 6.178541236913029e-06,
      "loss": 0.0019,
      "num_tokens": 139051465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1988
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.663,
      "grad_norm": 1.6366986654858806e-09,
      "kl": 0.04425048828125,
      "learning_rate": 6.167791411030027e-06,
      "loss": 0.0018,
      "num_tokens": 139127529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1989
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6633333333333333,
      "grad_norm": 1.3695743428243645e-09,
      "kl": 0.049072265625,
      "learning_rate": 6.157046773401964e-06,
      "loss": 0.002,
      "num_tokens": 139201753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1990
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6636666666666666,
      "grad_norm": 2.369603624430283e-09,
      "kl": 0.04461669921875,
      "learning_rate": 6.146307338575519e-06,
      "loss": 0.0018,
      "num_tokens": 139279497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1991
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.664,
      "grad_norm": 1.6871206653945592e-09,
      "kl": 0.04547119140625,
      "learning_rate": 6.135573121090327e-06,
      "loss": 0.0018,
      "num_tokens": 139356137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1992
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6643333333333333,
      "grad_norm": 1.5836225664145331e-09,
      "kl": 0.04949951171875,
      "learning_rate": 6.124844135478971e-06,
      "loss": 0.002,
      "num_tokens": 139433321.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1993
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6646666666666666,
      "grad_norm": 2.220256423157707e-09,
      "kl": 0.04766845703125,
      "learning_rate": 6.114120396266936e-06,
      "loss": 0.0019,
      "num_tokens": 139509401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1994
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.665,
      "grad_norm": 1.900301693780193e-09,
      "kl": 0.04791259765625,
      "learning_rate": 6.1034019179726115e-06,
      "loss": 0.0019,
      "num_tokens": 139586217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1995
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6653333333333333,
      "grad_norm": 8.209671831238552e-10,
      "kl": 0.04547119140625,
      "learning_rate": 6.092688715107265e-06,
      "loss": 0.0018,
      "num_tokens": 139661241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1996
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6656666666666666,
      "grad_norm": 9.694834934848018e-10,
      "kl": 0.04638671875,
      "learning_rate": 6.081980802175016e-06,
      "loss": 0.0019,
      "num_tokens": 139735881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1997
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.666,
      "grad_norm": 1.608321920087974e-09,
      "kl": 0.04681396484375,
      "learning_rate": 6.071278193672834e-06,
      "loss": 0.0019,
      "num_tokens": 139811689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1998
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6663333333333333,
      "grad_norm": 1.3388211650422477e-09,
      "kl": 0.04205322265625,
      "learning_rate": 6.06058090409049e-06,
      "loss": 0.0017,
      "num_tokens": 139886649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 1999
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6666666666666666,
      "grad_norm": 1.0311029807752448e-09,
      "kl": 0.0474853515625,
      "learning_rate": 6.049888947910569e-06,
      "loss": 0.0019,
      "num_tokens": 139961113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2000
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.667,
      "grad_norm": 1.3561779477200275e-09,
      "kl": 0.04296875,
      "learning_rate": 6.039202339608432e-06,
      "loss": 0.0017,
      "num_tokens": 140034249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2001
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6673333333333333,
      "grad_norm": 1.8810895063836597e-09,
      "kl": 0.04254150390625,
      "learning_rate": 6.028521093652195e-06,
      "loss": 0.0017,
      "num_tokens": 140108361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2002
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6676666666666666,
      "grad_norm": 1.844388641814021e-09,
      "kl": 0.045654296875,
      "learning_rate": 6.0178452245027165e-06,
      "loss": 0.0018,
      "num_tokens": 140184745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2003
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.668,
      "grad_norm": 2.4463555625686695e-09,
      "kl": 0.04541015625,
      "learning_rate": 6.007174746613576e-06,
      "loss": 0.0018,
      "num_tokens": 140263017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2004
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6683333333333333,
      "grad_norm": 1.571687002765998e-09,
      "kl": 0.04901123046875,
      "learning_rate": 5.996509674431053e-06,
      "loss": 0.002,
      "num_tokens": 140339609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2005
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6686666666666666,
      "grad_norm": 1.5943716347166514e-09,
      "kl": 0.04266357421875,
      "learning_rate": 5.9858500223941066e-06,
      "loss": 0.0017,
      "num_tokens": 140415753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2006
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.669,
      "grad_norm": 1.707277763607351e-09,
      "kl": 0.04742431640625,
      "learning_rate": 5.975195804934369e-06,
      "loss": 0.0019,
      "num_tokens": 140497497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2007
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6693333333333333,
      "grad_norm": 1.3581950009111665e-09,
      "kl": 0.04461669921875,
      "learning_rate": 5.9645470364761e-06,
      "loss": 0.0018,
      "num_tokens": 140573241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2008
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6696666666666666,
      "grad_norm": 1.9264798645224346e-09,
      "kl": 0.0455322265625,
      "learning_rate": 5.953903731436191e-06,
      "loss": 0.0018,
      "num_tokens": 140650793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2009
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.67,
      "grad_norm": 2.4505941720320834e-09,
      "kl": 0.03790283203125,
      "learning_rate": 5.943265904224133e-06,
      "loss": 0.0015,
      "num_tokens": 140728105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2010
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6703333333333333,
      "grad_norm": 1.2943666138909293e-09,
      "kl": 0.048583984375,
      "learning_rate": 5.932633569242e-06,
      "loss": 0.0019,
      "num_tokens": 140801273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2011
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6706666666666666,
      "grad_norm": 1.3821218614040731e-09,
      "kl": 0.0423583984375,
      "learning_rate": 5.922006740884436e-06,
      "loss": 0.0017,
      "num_tokens": 140875241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2012
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.671,
      "grad_norm": 2.01482985850987e-09,
      "kl": 0.0430908203125,
      "learning_rate": 5.911385433538621e-06,
      "loss": 0.0017,
      "num_tokens": 140951657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2013
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6713333333333333,
      "grad_norm": 1.3315020197524063e-09,
      "kl": 0.04400634765625,
      "learning_rate": 5.900769661584273e-06,
      "loss": 0.0018,
      "num_tokens": 141026185.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2014
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6716666666666666,
      "grad_norm": 1.3825495193131587e-09,
      "kl": 0.04791259765625,
      "learning_rate": 5.890159439393604e-06,
      "loss": 0.0019,
      "num_tokens": 141101081.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2015
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.672,
      "grad_norm": 1.579926633965556e-09,
      "kl": 0.046630859375,
      "learning_rate": 5.879554781331317e-06,
      "loss": 0.0019,
      "num_tokens": 141174681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2016
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6723333333333333,
      "grad_norm": 1.5863337310406678e-09,
      "kl": 0.04119873046875,
      "learning_rate": 5.868955701754584e-06,
      "loss": 0.0016,
      "num_tokens": 141249417.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2017
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6726666666666666,
      "grad_norm": 2.0115535903642012e-09,
      "kl": 0.04156494140625,
      "learning_rate": 5.858362215013018e-06,
      "loss": 0.0017,
      "num_tokens": 141327305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2018
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.673,
      "grad_norm": 1.6781062095461152e-09,
      "kl": 0.044921875,
      "learning_rate": 5.847774335448671e-06,
      "loss": 0.0018,
      "num_tokens": 141403417.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2019
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6733333333333333,
      "grad_norm": 1.6332654118045298e-09,
      "kl": 0.04815673828125,
      "learning_rate": 5.83719207739599e-06,
      "loss": 0.0019,
      "num_tokens": 141485225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2020
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6736666666666666,
      "grad_norm": 2.074878269198166e-09,
      "kl": 0.04534912109375,
      "learning_rate": 5.8266154551818225e-06,
      "loss": 0.0018,
      "num_tokens": 141561497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2021
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.674,
      "grad_norm": 2.0747485951488898e-09,
      "kl": 0.042236328125,
      "learning_rate": 5.816044483125381e-06,
      "loss": 0.0017,
      "num_tokens": 141637273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2022
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6743333333333333,
      "grad_norm": 1.8546603142155504e-09,
      "kl": 0.0482177734375,
      "learning_rate": 5.8054791755382286e-06,
      "loss": 0.0019,
      "num_tokens": 141712553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2023
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6746666666666666,
      "grad_norm": 1.969717944305671e-09,
      "kl": 0.0482177734375,
      "learning_rate": 5.7949195467242654e-06,
      "loss": 0.0019,
      "num_tokens": 141787737.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2024
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.675,
      "grad_norm": 1.5042752599114806e-09,
      "kl": 0.04595947265625,
      "learning_rate": 5.784365610979692e-06,
      "loss": 0.0018,
      "num_tokens": 141861449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2025
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6753333333333333,
      "grad_norm": 2.8594180356833476e-09,
      "kl": 0.04730224609375,
      "learning_rate": 5.773817382593008e-06,
      "loss": 0.0019,
      "num_tokens": 141939705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2026
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6756666666666666,
      "grad_norm": 1.5209469239607643e-09,
      "kl": 0.04730224609375,
      "learning_rate": 5.7632748758449865e-06,
      "loss": 0.0019,
      "num_tokens": 142013753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2027
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.676,
      "grad_norm": 1.316886599767031e-09,
      "kl": 0.04486083984375,
      "learning_rate": 5.7527381050086555e-06,
      "loss": 0.0018,
      "num_tokens": 142088249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2028
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6763333333333333,
      "grad_norm": 1.4880415788454115e-09,
      "kl": 0.04595947265625,
      "learning_rate": 5.742207084349274e-06,
      "loss": 0.0018,
      "num_tokens": 142161705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2029
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6766666666666666,
      "grad_norm": 2.9831463965734883e-09,
      "kl": 0.04656982421875,
      "learning_rate": 5.73168182812432e-06,
      "loss": 0.0019,
      "num_tokens": 142240377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2030
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.677,
      "grad_norm": 1.5522306773263495e-09,
      "kl": 0.04071044921875,
      "learning_rate": 5.72116235058346e-06,
      "loss": 0.0016,
      "num_tokens": 142314281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2031
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6773333333333333,
      "grad_norm": 1.1397017773973062e-09,
      "kl": 0.04840087890625,
      "learning_rate": 5.710648665968543e-06,
      "loss": 0.0019,
      "num_tokens": 142389417.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2032
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6776666666666666,
      "grad_norm": 1.2189036446841328e-09,
      "kl": 0.0494384765625,
      "learning_rate": 5.700140788513575e-06,
      "loss": 0.002,
      "num_tokens": 142463945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2033
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.678,
      "grad_norm": 2.6483322201897863e-09,
      "kl": 0.04705810546875,
      "learning_rate": 5.689638732444699e-06,
      "loss": 0.0019,
      "num_tokens": 142542073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2034
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6783333333333333,
      "grad_norm": 1.1243739272970288e-09,
      "kl": 0.044189453125,
      "learning_rate": 5.679142511980176e-06,
      "loss": 0.0018,
      "num_tokens": 142615977.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2035
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6786666666666666,
      "grad_norm": 1.3952109467751939e-09,
      "kl": 0.04559326171875,
      "learning_rate": 5.668652141330373e-06,
      "loss": 0.0018,
      "num_tokens": 142691065.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2036
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.679,
      "grad_norm": 3.4144658211943124e-09,
      "kl": 0.048583984375,
      "learning_rate": 5.65816763469772e-06,
      "loss": 0.0019,
      "num_tokens": 142768761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2037
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6793333333333333,
      "grad_norm": 1.879545852290221e-09,
      "kl": 0.04730224609375,
      "learning_rate": 5.647689006276727e-06,
      "loss": 0.0019,
      "num_tokens": 142848713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2038
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6796666666666666,
      "grad_norm": 2.209214589043995e-09,
      "kl": 0.04541015625,
      "learning_rate": 5.637216270253934e-06,
      "loss": 0.0018,
      "num_tokens": 142925785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2039
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.68,
      "grad_norm": 1.7563561716116283e-09,
      "kl": 0.04833984375,
      "learning_rate": 5.626749440807915e-06,
      "loss": 0.0019,
      "num_tokens": 143001609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2040
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6803333333333333,
      "grad_norm": 1.4760456190643367e-09,
      "kl": 0.04571533203125,
      "learning_rate": 5.616288532109225e-06,
      "loss": 0.0018,
      "num_tokens": 143077801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2041
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6806666666666666,
      "grad_norm": 1.5510832618303994e-09,
      "kl": 0.0435791015625,
      "learning_rate": 5.605833558320432e-06,
      "loss": 0.0017,
      "num_tokens": 143151849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2042
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.681,
      "grad_norm": 1.6523277190927388e-09,
      "kl": 0.0428466796875,
      "learning_rate": 5.595384533596054e-06,
      "loss": 0.0017,
      "num_tokens": 143226073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2043
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6813333333333333,
      "grad_norm": 2.4642576867961452e-09,
      "kl": 0.0469970703125,
      "learning_rate": 5.584941472082549e-06,
      "loss": 0.0019,
      "num_tokens": 143304521.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2044
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6816666666666666,
      "grad_norm": 2.030916324002874e-09,
      "kl": 0.04351806640625,
      "learning_rate": 5.574504387918311e-06,
      "loss": 0.0017,
      "num_tokens": 143380617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2045
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.682,
      "grad_norm": 1.3033975010401377e-09,
      "kl": 0.04571533203125,
      "learning_rate": 5.564073295233645e-06,
      "loss": 0.0018,
      "num_tokens": 143454569.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2046
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6823333333333333,
      "grad_norm": 2.7357243137515752e-09,
      "kl": 0.04534912109375,
      "learning_rate": 5.553648208150728e-06,
      "loss": 0.0018,
      "num_tokens": 143532873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2047
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6826666666666666,
      "grad_norm": 1.582678210709787e-09,
      "kl": 0.045166015625,
      "learning_rate": 5.543229140783619e-06,
      "loss": 0.0018,
      "num_tokens": 143607145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2048
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.683,
      "grad_norm": 1.2580937402972836e-09,
      "kl": 0.04180908203125,
      "learning_rate": 5.5328161072382355e-06,
      "loss": 0.0017,
      "num_tokens": 143682169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2049
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6833333333333333,
      "grad_norm": 1.1265001154114884e-09,
      "kl": 0.04913330078125,
      "learning_rate": 5.522409121612304e-06,
      "loss": 0.002,
      "num_tokens": 143756825.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2050
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6836666666666666,
      "grad_norm": 1.9295671727093122e-09,
      "kl": 0.04644775390625,
      "learning_rate": 5.512008197995379e-06,
      "loss": 0.0019,
      "num_tokens": 143831769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2051
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.684,
      "grad_norm": 2.35977948292998e-09,
      "kl": 0.04443359375,
      "learning_rate": 5.501613350468802e-06,
      "loss": 0.0018,
      "num_tokens": 143907161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2052
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6843333333333333,
      "grad_norm": 1.4888303923044077e-09,
      "kl": 0.0421142578125,
      "learning_rate": 5.491224593105695e-06,
      "loss": 0.0017,
      "num_tokens": 143986169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2053
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6846666666666666,
      "grad_norm": 1.4937405756754174e-09,
      "kl": 0.04412841796875,
      "learning_rate": 5.480841939970918e-06,
      "loss": 0.0018,
      "num_tokens": 144062345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2054
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.685,
      "grad_norm": 1.4774684808926963e-09,
      "kl": 0.0465087890625,
      "learning_rate": 5.470465405121093e-06,
      "loss": 0.0019,
      "num_tokens": 144139593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2055
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6853333333333333,
      "grad_norm": 2.318879754881209e-09,
      "kl": 0.045166015625,
      "learning_rate": 5.460095002604533e-06,
      "loss": 0.0018,
      "num_tokens": 144214697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2056
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6856666666666666,
      "grad_norm": 2.032983337230121e-09,
      "kl": 0.04827880859375,
      "learning_rate": 5.449730746461264e-06,
      "loss": 0.0019,
      "num_tokens": 144290169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2057
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.686,
      "grad_norm": 2.985671487820696e-09,
      "kl": 0.04425048828125,
      "learning_rate": 5.439372650722985e-06,
      "loss": 0.0018,
      "num_tokens": 144365353.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2058
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6863333333333334,
      "grad_norm": 1.6914258882394506e-09,
      "kl": 0.0467529296875,
      "learning_rate": 5.429020729413062e-06,
      "loss": 0.0019,
      "num_tokens": 144441353.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2059
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6866666666666666,
      "grad_norm": 1.898779133924222e-09,
      "kl": 0.04412841796875,
      "learning_rate": 5.418674996546486e-06,
      "loss": 0.0018,
      "num_tokens": 144517785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2060
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.687,
      "grad_norm": 1.4402256054424356e-09,
      "kl": 0.045654296875,
      "learning_rate": 5.4083354661298816e-06,
      "loss": 0.0018,
      "num_tokens": 144592841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2061
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6873333333333334,
      "grad_norm": 1.7919947747913056e-09,
      "kl": 0.04840087890625,
      "learning_rate": 5.398002152161484e-06,
      "loss": 0.0019,
      "num_tokens": 144668841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2062
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6876666666666666,
      "grad_norm": 2.1313002473988263e-09,
      "kl": 0.0439453125,
      "learning_rate": 5.387675068631093e-06,
      "loss": 0.0018,
      "num_tokens": 144745113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2063
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.688,
      "grad_norm": 1.473969835075195e-09,
      "kl": 0.0474853515625,
      "learning_rate": 5.377354229520086e-06,
      "loss": 0.0019,
      "num_tokens": 144819417.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2064
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6883333333333334,
      "grad_norm": 1.8933425938172377e-09,
      "kl": 0.044921875,
      "learning_rate": 5.367039648801386e-06,
      "loss": 0.0018,
      "num_tokens": 144896825.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2065
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6886666666666666,
      "grad_norm": 1.482761025073387e-09,
      "kl": 0.0489501953125,
      "learning_rate": 5.356731340439432e-06,
      "loss": 0.002,
      "num_tokens": 144972729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2066
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.689,
      "grad_norm": 1.2817790162600318e-09,
      "kl": 0.04937744140625,
      "learning_rate": 5.346429318390185e-06,
      "loss": 0.002,
      "num_tokens": 145045913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2067
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6893333333333334,
      "grad_norm": 1.8539841883935537e-09,
      "kl": 0.05120849609375,
      "learning_rate": 5.336133596601089e-06,
      "loss": 0.002,
      "num_tokens": 145120665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2068
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6896666666666667,
      "grad_norm": 1.4539929260592999e-09,
      "kl": 0.046630859375,
      "learning_rate": 5.325844189011058e-06,
      "loss": 0.0019,
      "num_tokens": 145195001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2069
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.69,
      "grad_norm": 3.0194735600730382e-09,
      "kl": 0.044921875,
      "learning_rate": 5.31556110955046e-06,
      "loss": 0.0018,
      "num_tokens": 145272713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2070
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6903333333333334,
      "grad_norm": 1.479883660060466e-09,
      "kl": 0.0426025390625,
      "learning_rate": 5.305284372141095e-06,
      "loss": 0.0017,
      "num_tokens": 145348745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2071
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6906666666666667,
      "grad_norm": 1.2946799188284785e-09,
      "kl": 0.04815673828125,
      "learning_rate": 5.2950139906961716e-06,
      "loss": 0.0019,
      "num_tokens": 145422921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2072
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.691,
      "grad_norm": 1.7997818790860265e-09,
      "kl": 0.04925537109375,
      "learning_rate": 5.284749979120299e-06,
      "loss": 0.002,
      "num_tokens": 145498569.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2073
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6913333333333334,
      "grad_norm": 1.3114987984508275e-09,
      "kl": 0.0455322265625,
      "learning_rate": 5.274492351309462e-06,
      "loss": 0.0018,
      "num_tokens": 145573865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2074
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6916666666666667,
      "grad_norm": 1.2795651205266267e-09,
      "kl": 0.04296875,
      "learning_rate": 5.2642411211510005e-06,
      "loss": 0.0017,
      "num_tokens": 145653497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2075
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.692,
      "grad_norm": 1.3581488156333421e-09,
      "kl": 0.0458984375,
      "learning_rate": 5.253996302523596e-06,
      "loss": 0.0018,
      "num_tokens": 145727097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2076
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6923333333333334,
      "grad_norm": 1.5171742751007855e-09,
      "kl": 0.04449462890625,
      "learning_rate": 5.243757909297247e-06,
      "loss": 0.0018,
      "num_tokens": 145801849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2077
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6926666666666667,
      "grad_norm": 1.035644126012869e-09,
      "kl": 0.045654296875,
      "learning_rate": 5.233525955333258e-06,
      "loss": 0.0018,
      "num_tokens": 145876761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2078
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.693,
      "grad_norm": 1.6071520780869264e-09,
      "kl": 0.04132080078125,
      "learning_rate": 5.223300454484204e-06,
      "loss": 0.0017,
      "num_tokens": 145953289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2079
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6933333333333334,
      "grad_norm": 1.9325581135376524e-09,
      "kl": 0.04681396484375,
      "learning_rate": 5.213081420593933e-06,
      "loss": 0.0019,
      "num_tokens": 146028025.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2080
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6936666666666667,
      "grad_norm": 1.6818265669016341e-09,
      "kl": 0.05078125,
      "learning_rate": 5.202868867497542e-06,
      "loss": 0.002,
      "num_tokens": 146103769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2081
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.694,
      "grad_norm": 1.5958325771947557e-09,
      "kl": 0.04736328125,
      "learning_rate": 5.192662809021334e-06,
      "loss": 0.0019,
      "num_tokens": 146179113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2082
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6943333333333334,
      "grad_norm": 1.9440562493144853e-09,
      "kl": 0.0452880859375,
      "learning_rate": 5.1824632589828465e-06,
      "loss": 0.0018,
      "num_tokens": 146263369.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2083
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6946666666666667,
      "grad_norm": 2.4925215225124475e-09,
      "kl": 0.044677734375,
      "learning_rate": 5.172270231190789e-06,
      "loss": 0.0018,
      "num_tokens": 146339257.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2084
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.695,
      "grad_norm": 1.077152811390647e-09,
      "kl": 0.04925537109375,
      "learning_rate": 5.162083739445038e-06,
      "loss": 0.002,
      "num_tokens": 146414569.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2085
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6953333333333334,
      "grad_norm": 1.532193483200217e-09,
      "kl": 0.041046142578125,
      "learning_rate": 5.151903797536631e-06,
      "loss": 0.0016,
      "num_tokens": 146491769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2086
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6956666666666667,
      "grad_norm": 1.783924008513793e-09,
      "kl": 0.047607421875,
      "learning_rate": 5.141730419247735e-06,
      "loss": 0.0019,
      "num_tokens": 146566969.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2087
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.696,
      "grad_norm": 1.5400581920843592e-09,
      "kl": 0.0465087890625,
      "learning_rate": 5.131563618351624e-06,
      "loss": 0.0019,
      "num_tokens": 146641657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2088
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6963333333333334,
      "grad_norm": 1.463492549369505e-09,
      "kl": 0.04302978515625,
      "learning_rate": 5.121403408612672e-06,
      "loss": 0.0017,
      "num_tokens": 146717241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2089
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6966666666666667,
      "grad_norm": 1.8039523208557284e-09,
      "kl": 0.04620361328125,
      "learning_rate": 5.111249803786342e-06,
      "loss": 0.0019,
      "num_tokens": 146794537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2090
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.697,
      "grad_norm": 2.022987111161001e-09,
      "kl": 0.046142578125,
      "learning_rate": 5.101102817619132e-06,
      "loss": 0.0018,
      "num_tokens": 146871529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2091
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6973333333333334,
      "grad_norm": 1.6709652550517262e-09,
      "kl": 0.0401611328125,
      "learning_rate": 5.090962463848592e-06,
      "loss": 0.0016,
      "num_tokens": 146948457.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2092
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6976666666666667,
      "grad_norm": 1.714156594445626e-09,
      "kl": 0.0465087890625,
      "learning_rate": 5.080828756203294e-06,
      "loss": 0.0019,
      "num_tokens": 147023577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2093
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.698,
      "grad_norm": 9.676069945285803e-10,
      "kl": 0.04559326171875,
      "learning_rate": 5.070701708402812e-06,
      "loss": 0.0018,
      "num_tokens": 147097897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2094
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6983333333333334,
      "grad_norm": 1.453238973603277e-09,
      "kl": 0.04852294921875,
      "learning_rate": 5.060581334157693e-06,
      "loss": 0.0019,
      "num_tokens": 147173257.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2095
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6986666666666667,
      "grad_norm": 2.0685952950572073e-09,
      "kl": 0.04791259765625,
      "learning_rate": 5.05046764716946e-06,
      "loss": 0.0019,
      "num_tokens": 147249721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2096
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.699,
      "grad_norm": 1.3517864605461227e-09,
      "kl": 0.04638671875,
      "learning_rate": 5.04036066113058e-06,
      "loss": 0.0019,
      "num_tokens": 147323913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2097
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6993333333333334,
      "grad_norm": 2.0163619662838528e-09,
      "kl": 0.0394287109375,
      "learning_rate": 5.030260389724447e-06,
      "loss": 0.0016,
      "num_tokens": 147398553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2098
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.6996666666666667,
      "grad_norm": 1.2988320419182742e-09,
      "kl": 0.04437255859375,
      "learning_rate": 5.020166846625365e-06,
      "loss": 0.0018,
      "num_tokens": 147473385.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2099
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7,
      "grad_norm": 1.288446682679023e-09,
      "kl": 0.04736328125,
      "learning_rate": 5.01008004549853e-06,
      "loss": 0.0019,
      "num_tokens": 147548361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2100
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7003333333333334,
      "grad_norm": 1.965429596850754e-09,
      "kl": 0.04327392578125,
      "learning_rate": 5.000000000000003e-06,
      "loss": 0.0017,
      "num_tokens": 147627929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2101
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7006666666666667,
      "grad_norm": 1.3585821356798533e-09,
      "kl": 0.04840087890625,
      "learning_rate": 4.989926723776707e-06,
      "loss": 0.0019,
      "num_tokens": 147704937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2102
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.701,
      "grad_norm": 3.060430797674485e-09,
      "kl": 0.0439453125,
      "learning_rate": 4.979860230466398e-06,
      "loss": 0.0018,
      "num_tokens": 147783881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2103
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7013333333333334,
      "grad_norm": 2.4823232358528458e-09,
      "kl": 0.04034423828125,
      "learning_rate": 4.96980053369765e-06,
      "loss": 0.0016,
      "num_tokens": 147867241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2104
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7016666666666667,
      "grad_norm": 1.5334354896978653e-09,
      "kl": 0.04876708984375,
      "learning_rate": 4.959747647089833e-06,
      "loss": 0.0019,
      "num_tokens": 147942585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2105
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.702,
      "grad_norm": 1.142280492416603e-09,
      "kl": 0.04534912109375,
      "learning_rate": 4.949701584253103e-06,
      "loss": 0.0018,
      "num_tokens": 148021193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2106
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7023333333333334,
      "grad_norm": 1.3960120837097634e-09,
      "kl": 0.0440673828125,
      "learning_rate": 4.939662358788364e-06,
      "loss": 0.0018,
      "num_tokens": 148095577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2107
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7026666666666667,
      "grad_norm": 1.0086537161058118e-09,
      "kl": 0.043212890625,
      "learning_rate": 4.929629984287278e-06,
      "loss": 0.0017,
      "num_tokens": 148169769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2108
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.703,
      "grad_norm": 1.8072595642237843e-09,
      "kl": 0.04205322265625,
      "learning_rate": 4.919604474332223e-06,
      "loss": 0.0017,
      "num_tokens": 148246585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2109
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7033333333333334,
      "grad_norm": 1.6154312332261611e-09,
      "kl": 0.04547119140625,
      "learning_rate": 4.909585842496287e-06,
      "loss": 0.0018,
      "num_tokens": 148322633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2110
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7036666666666667,
      "grad_norm": 1.4079877264094875e-09,
      "kl": 0.0430908203125,
      "learning_rate": 4.899574102343247e-06,
      "loss": 0.0017,
      "num_tokens": 148397705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2111
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.704,
      "grad_norm": 9.878139417551779e-10,
      "kl": 0.04718017578125,
      "learning_rate": 4.889569267427548e-06,
      "loss": 0.0019,
      "num_tokens": 148472697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2112
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7043333333333334,
      "grad_norm": 1.757420431403034e-09,
      "kl": 0.04595947265625,
      "learning_rate": 4.879571351294287e-06,
      "loss": 0.0018,
      "num_tokens": 148546697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2113
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7046666666666667,
      "grad_norm": 1.940094529473413e-09,
      "kl": 0.04449462890625,
      "learning_rate": 4.869580367479187e-06,
      "loss": 0.0018,
      "num_tokens": 148629001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2114
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.705,
      "grad_norm": 1.536891280906616e-09,
      "kl": 0.04962158203125,
      "learning_rate": 4.859596329508598e-06,
      "loss": 0.002,
      "num_tokens": 148705049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2115
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7053333333333334,
      "grad_norm": 1.389648507377217e-09,
      "kl": 0.04736328125,
      "learning_rate": 4.849619250899458e-06,
      "loss": 0.0019,
      "num_tokens": 148782457.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2116
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7056666666666667,
      "grad_norm": 1.1227275775738121e-09,
      "kl": 0.0474853515625,
      "learning_rate": 4.8396491451592855e-06,
      "loss": 0.0019,
      "num_tokens": 148856649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2117
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.706,
      "grad_norm": 2.2867983062724306e-09,
      "kl": 0.04827880859375,
      "learning_rate": 4.8296860257861585e-06,
      "loss": 0.0019,
      "num_tokens": 148933225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2118
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7063333333333334,
      "grad_norm": 2.3825512673880667e-09,
      "kl": 0.044189453125,
      "learning_rate": 4.8197299062687e-06,
      "loss": 0.0018,
      "num_tokens": 149014889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2119
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7066666666666667,
      "grad_norm": 1.2605755328465307e-09,
      "kl": 0.046875,
      "learning_rate": 4.809780800086046e-06,
      "loss": 0.0019,
      "num_tokens": 149090249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2120
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.707,
      "grad_norm": 1.584815279009888e-09,
      "kl": 0.05072021484375,
      "learning_rate": 4.799838720707847e-06,
      "loss": 0.002,
      "num_tokens": 149166281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2121
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7073333333333334,
      "grad_norm": 1.2099057311587558e-09,
      "kl": 0.0447998046875,
      "learning_rate": 4.78990368159424e-06,
      "loss": 0.0018,
      "num_tokens": 149241017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2122
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7076666666666667,
      "grad_norm": 1.5133145847556762e-09,
      "kl": 0.044921875,
      "learning_rate": 4.7799756961958195e-06,
      "loss": 0.0018,
      "num_tokens": 149317801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2123
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.708,
      "grad_norm": 1.2313777775219137e-09,
      "kl": 0.04345703125,
      "learning_rate": 4.770054777953647e-06,
      "loss": 0.0017,
      "num_tokens": 149392745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2124
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7083333333333334,
      "grad_norm": 1.6607938357893204e-09,
      "kl": 0.04449462890625,
      "learning_rate": 4.76014094029921e-06,
      "loss": 0.0018,
      "num_tokens": 149467033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2125
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7086666666666667,
      "grad_norm": 1.5748307102825265e-09,
      "kl": 0.0469970703125,
      "learning_rate": 4.7502341966544e-06,
      "loss": 0.0019,
      "num_tokens": 149542585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2126
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.709,
      "grad_norm": 1.1141104705458815e-09,
      "kl": 0.04754638671875,
      "learning_rate": 4.7403345604315135e-06,
      "loss": 0.0019,
      "num_tokens": 149617705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2127
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7093333333333334,
      "grad_norm": 1.163764307143822e-09,
      "kl": 0.04827880859375,
      "learning_rate": 4.7304420450332244e-06,
      "loss": 0.0019,
      "num_tokens": 149690121.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2128
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7096666666666667,
      "grad_norm": 1.996229403999905e-09,
      "kl": 0.04461669921875,
      "learning_rate": 4.720556663852569e-06,
      "loss": 0.0018,
      "num_tokens": 149764761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2129
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.71,
      "grad_norm": 1.041037589466498e-09,
      "kl": 0.04473876953125,
      "learning_rate": 4.710678430272907e-06,
      "loss": 0.0018,
      "num_tokens": 149838857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2130
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7103333333333334,
      "grad_norm": 1.8411533408979608e-09,
      "kl": 0.04400634765625,
      "learning_rate": 4.700807357667953e-06,
      "loss": 0.0018,
      "num_tokens": 149917577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2131
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7106666666666667,
      "grad_norm": 1.5097793015783623e-09,
      "kl": 0.0452880859375,
      "learning_rate": 4.690943459401693e-06,
      "loss": 0.0018,
      "num_tokens": 149993753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2132
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.711,
      "grad_norm": 1.459474985310294e-09,
      "kl": 0.04681396484375,
      "learning_rate": 4.681086748828424e-06,
      "loss": 0.0019,
      "num_tokens": 150069993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2133
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7113333333333334,
      "grad_norm": 2.696329159945776e-09,
      "kl": 0.0457763671875,
      "learning_rate": 4.671237239292699e-06,
      "loss": 0.0018,
      "num_tokens": 150147881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2134
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7116666666666667,
      "grad_norm": 2.3898489853735327e-09,
      "kl": 0.04254150390625,
      "learning_rate": 4.661394944129334e-06,
      "loss": 0.0017,
      "num_tokens": 150223785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2135
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.712,
      "grad_norm": 2.07627381954012e-09,
      "kl": 0.04620361328125,
      "learning_rate": 4.65155987666336e-06,
      "loss": 0.0018,
      "num_tokens": 150299689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2136
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7123333333333334,
      "grad_norm": 1.6406819236536307e-09,
      "kl": 0.04632568359375,
      "learning_rate": 4.641732050210032e-06,
      "loss": 0.0019,
      "num_tokens": 150375081.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2137
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7126666666666667,
      "grad_norm": 2.9409696900017934e-09,
      "kl": 0.04730224609375,
      "learning_rate": 4.631911478074815e-06,
      "loss": 0.0019,
      "num_tokens": 150450585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2138
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.713,
      "grad_norm": 1.2244349978374203e-09,
      "kl": 0.04931640625,
      "learning_rate": 4.622098173553329e-06,
      "loss": 0.002,
      "num_tokens": 150526073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2139
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7133333333333334,
      "grad_norm": 1.415452866027067e-09,
      "kl": 0.0465087890625,
      "learning_rate": 4.612292149931369e-06,
      "loss": 0.0019,
      "num_tokens": 150600985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2140
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7136666666666667,
      "grad_norm": 1.3033741863566206e-09,
      "kl": 0.0447998046875,
      "learning_rate": 4.6024934204848745e-06,
      "loss": 0.0018,
      "num_tokens": 150676121.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2141
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.714,
      "grad_norm": 2.133075716059807e-09,
      "kl": 0.0462646484375,
      "learning_rate": 4.592701998479896e-06,
      "loss": 0.0018,
      "num_tokens": 150752281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2142
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7143333333333334,
      "grad_norm": 3.084403843445216e-09,
      "kl": 0.04852294921875,
      "learning_rate": 4.582917897172603e-06,
      "loss": 0.0019,
      "num_tokens": 150833913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2143
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7146666666666667,
      "grad_norm": 2.0388870591858677e-09,
      "kl": 0.04669189453125,
      "learning_rate": 4.573141129809252e-06,
      "loss": 0.0019,
      "num_tokens": 150908921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2144
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.715,
      "grad_norm": 1.781347402918243e-09,
      "kl": 0.04766845703125,
      "learning_rate": 4.563371709626167e-06,
      "loss": 0.0019,
      "num_tokens": 150985177.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2145
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7153333333333334,
      "grad_norm": 1.2571244045744834e-09,
      "kl": 0.04083251953125,
      "learning_rate": 4.5536096498497295e-06,
      "loss": 0.0016,
      "num_tokens": 151060057.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2146
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7156666666666667,
      "grad_norm": 1.179564002029565e-09,
      "kl": 0.0494384765625,
      "learning_rate": 4.5438549636963534e-06,
      "loss": 0.002,
      "num_tokens": 151136313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2147
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.716,
      "grad_norm": 1.0286048679475357e-09,
      "kl": 0.04669189453125,
      "learning_rate": 4.534107664372466e-06,
      "loss": 0.0019,
      "num_tokens": 151210281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2148
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7163333333333334,
      "grad_norm": 1.909217450801748e-09,
      "kl": 0.04693603515625,
      "learning_rate": 4.524367765074499e-06,
      "loss": 0.0019,
      "num_tokens": 151286841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2149
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7166666666666667,
      "grad_norm": 2.408347743454442e-09,
      "kl": 0.0455322265625,
      "learning_rate": 4.514635278988866e-06,
      "loss": 0.0018,
      "num_tokens": 151367913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2150
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.717,
      "grad_norm": 2.5759745447828664e-09,
      "kl": 0.0472412109375,
      "learning_rate": 4.504910219291941e-06,
      "loss": 0.0019,
      "num_tokens": 151445033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2151
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7173333333333334,
      "grad_norm": 2.2820960676739332e-09,
      "kl": 0.04449462890625,
      "learning_rate": 4.495192599150045e-06,
      "loss": 0.0018,
      "num_tokens": 151527017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2152
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7176666666666667,
      "grad_norm": 1.5311073520152263e-09,
      "kl": 0.04119873046875,
      "learning_rate": 4.4854824317194266e-06,
      "loss": 0.0016,
      "num_tokens": 151602889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2153
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.718,
      "grad_norm": 1.3136572940553037e-09,
      "kl": 0.04449462890625,
      "learning_rate": 4.475779730146252e-06,
      "loss": 0.0018,
      "num_tokens": 151677881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2154
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7183333333333334,
      "grad_norm": 2.50921838862439e-09,
      "kl": 0.04840087890625,
      "learning_rate": 4.46608450756656e-06,
      "loss": 0.0019,
      "num_tokens": 151753161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2155
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7186666666666667,
      "grad_norm": 1.3053715886002237e-09,
      "kl": 0.04107666015625,
      "learning_rate": 4.4563967771062856e-06,
      "loss": 0.0016,
      "num_tokens": 151828425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2156
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.719,
      "grad_norm": 1.403503424590724e-09,
      "kl": 0.04791259765625,
      "learning_rate": 4.446716551881213e-06,
      "loss": 0.0019,
      "num_tokens": 151902697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2157
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7193333333333334,
      "grad_norm": 1.8047752181615806e-09,
      "kl": 0.0455322265625,
      "learning_rate": 4.437043844996952e-06,
      "loss": 0.0018,
      "num_tokens": 151980713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2158
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7196666666666667,
      "grad_norm": 3.648837010317152e-09,
      "kl": 0.04718017578125,
      "learning_rate": 4.427378669548958e-06,
      "loss": 0.0019,
      "num_tokens": 152060745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2159
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.72,
      "grad_norm": 1.6642344169426337e-09,
      "kl": 0.05120849609375,
      "learning_rate": 4.417721038622476e-06,
      "loss": 0.002,
      "num_tokens": 152137721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2160
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7203333333333334,
      "grad_norm": 1.6533822089215278e-09,
      "kl": 0.04241943359375,
      "learning_rate": 4.408070965292534e-06,
      "loss": 0.0017,
      "num_tokens": 152212841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2161
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7206666666666667,
      "grad_norm": 1.6148593573461767e-09,
      "kl": 0.0435791015625,
      "learning_rate": 4.398428462623932e-06,
      "loss": 0.0017,
      "num_tokens": 152288137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2162
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.721,
      "grad_norm": 1.7710726218922446e-09,
      "kl": 0.0526123046875,
      "learning_rate": 4.388793543671225e-06,
      "loss": 0.0021,
      "num_tokens": 152363241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2163
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7213333333333334,
      "grad_norm": 2.2588511061627514e-09,
      "kl": 0.0469970703125,
      "learning_rate": 4.379166221478697e-06,
      "loss": 0.0019,
      "num_tokens": 152439721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2164
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7216666666666667,
      "grad_norm": 1.7437623567317928e-09,
      "kl": 0.048583984375,
      "learning_rate": 4.369546509080338e-06,
      "loss": 0.0019,
      "num_tokens": 152513753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2165
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.722,
      "grad_norm": 1.8338061069655964e-09,
      "kl": 0.0426025390625,
      "learning_rate": 4.359934419499859e-06,
      "loss": 0.0017,
      "num_tokens": 152591145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2166
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7223333333333334,
      "grad_norm": 1.3991672265234456e-09,
      "kl": 0.04705810546875,
      "learning_rate": 4.350329965750622e-06,
      "loss": 0.0019,
      "num_tokens": 152665849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2167
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7226666666666667,
      "grad_norm": 1.6801825486467692e-09,
      "kl": 0.0447998046875,
      "learning_rate": 4.3407331608356715e-06,
      "loss": 0.0018,
      "num_tokens": 152741673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2168
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.723,
      "grad_norm": 1.5790622143185828e-09,
      "kl": 0.04547119140625,
      "learning_rate": 4.33114401774769e-06,
      "loss": 0.0018,
      "num_tokens": 152817337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2169
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7233333333333334,
      "grad_norm": 2.4515320884432867e-09,
      "kl": 0.04583740234375,
      "learning_rate": 4.321562549468991e-06,
      "loss": 0.0018,
      "num_tokens": 152894953.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2170
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7236666666666667,
      "grad_norm": 1.5095218408589517e-09,
      "kl": 0.04205322265625,
      "learning_rate": 4.311988768971484e-06,
      "loss": 0.0017,
      "num_tokens": 152969737.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2171
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.724,
      "grad_norm": 1.595224619066471e-09,
      "kl": 0.04669189453125,
      "learning_rate": 4.302422689216684e-06,
      "loss": 0.0019,
      "num_tokens": 153046505.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2172
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7243333333333334,
      "grad_norm": 1.006166483463744e-09,
      "kl": 0.04779052734375,
      "learning_rate": 4.292864323155684e-06,
      "loss": 0.0019,
      "num_tokens": 153121145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2173
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7246666666666667,
      "grad_norm": 1.7706424104702023e-09,
      "kl": 0.0487060546875,
      "learning_rate": 4.2833136837291165e-06,
      "loss": 0.0019,
      "num_tokens": 153195033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2174
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.725,
      "grad_norm": 1.425219608996997e-09,
      "kl": 0.045166015625,
      "learning_rate": 4.273770783867167e-06,
      "loss": 0.0018,
      "num_tokens": 153269817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2175
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7253333333333334,
      "grad_norm": 2.988301606166033e-09,
      "kl": 0.04571533203125,
      "learning_rate": 4.264235636489542e-06,
      "loss": 0.0018,
      "num_tokens": 153346185.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2176
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7256666666666667,
      "grad_norm": 1.2484224765074714e-09,
      "kl": 0.04498291015625,
      "learning_rate": 4.25470825450544e-06,
      "loss": 0.0018,
      "num_tokens": 153421225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2177
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.726,
      "grad_norm": 1.2475552813029367e-09,
      "kl": 0.04345703125,
      "learning_rate": 4.245188650813559e-06,
      "loss": 0.0017,
      "num_tokens": 153500969.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2178
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7263333333333334,
      "grad_norm": 1.612631139735754e-09,
      "kl": 0.04486083984375,
      "learning_rate": 4.235676838302069e-06,
      "loss": 0.0018,
      "num_tokens": 153576329.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2179
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7266666666666667,
      "grad_norm": 1.6703100014225924e-09,
      "kl": 0.04779052734375,
      "learning_rate": 4.226172829848576e-06,
      "loss": 0.0019,
      "num_tokens": 153652393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2180
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.727,
      "grad_norm": 1.538385752120064e-09,
      "kl": 0.04876708984375,
      "learning_rate": 4.216676638320135e-06,
      "loss": 0.0019,
      "num_tokens": 153727385.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2181
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7273333333333334,
      "grad_norm": 1.7500321192187585e-09,
      "kl": 0.04718017578125,
      "learning_rate": 4.207188276573214e-06,
      "loss": 0.0019,
      "num_tokens": 153805081.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2182
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7276666666666667,
      "grad_norm": 1.5101997430377878e-09,
      "kl": 0.04705810546875,
      "learning_rate": 4.197707757453675e-06,
      "loss": 0.0019,
      "num_tokens": 153879801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2183
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.728,
      "grad_norm": 1.2000829663350032e-08,
      "kl": 0.0457763671875,
      "learning_rate": 4.188235093796768e-06,
      "loss": 0.0018,
      "num_tokens": 153962745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2184
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7283333333333334,
      "grad_norm": 1.7656077710981322e-09,
      "kl": 0.0460205078125,
      "learning_rate": 4.178770298427107e-06,
      "loss": 0.0018,
      "num_tokens": 154037577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2185
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7286666666666667,
      "grad_norm": 1.1018516099525755e-09,
      "kl": 0.04669189453125,
      "learning_rate": 4.169313384158653e-06,
      "loss": 0.0019,
      "num_tokens": 154111545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2186
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.729,
      "grad_norm": 1.6076042719248562e-09,
      "kl": 0.0478515625,
      "learning_rate": 4.1598643637946975e-06,
      "loss": 0.0019,
      "num_tokens": 154186937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2187
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7293333333333333,
      "grad_norm": 2.422549494340842e-09,
      "kl": 0.04388427734375,
      "learning_rate": 4.150423250127846e-06,
      "loss": 0.0018,
      "num_tokens": 154263385.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2188
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7296666666666667,
      "grad_norm": 2.1332566824128207e-09,
      "kl": 0.04473876953125,
      "learning_rate": 4.140990055939997e-06,
      "loss": 0.0018,
      "num_tokens": 154340809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2189
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.73,
      "grad_norm": 1.4593349861868887e-09,
      "kl": 0.0467529296875,
      "learning_rate": 4.131564794002324e-06,
      "loss": 0.0019,
      "num_tokens": 154414281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2190
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7303333333333333,
      "grad_norm": 1.8784636068858163e-09,
      "kl": 0.0439453125,
      "learning_rate": 4.12214747707527e-06,
      "loss": 0.0018,
      "num_tokens": 154490777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2191
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7306666666666667,
      "grad_norm": 1.466127441673848e-09,
      "kl": 0.04290771484375,
      "learning_rate": 4.1127381179085145e-06,
      "loss": 0.0017,
      "num_tokens": 154566505.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2192
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.731,
      "grad_norm": 1.5561664179486456e-09,
      "kl": 0.04632568359375,
      "learning_rate": 4.103336729240967e-06,
      "loss": 0.0019,
      "num_tokens": 154640553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2193
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7313333333333333,
      "grad_norm": 1.3554396494086518e-09,
      "kl": 0.045166015625,
      "learning_rate": 4.093943323800746e-06,
      "loss": 0.0018,
      "num_tokens": 154716201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2194
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7316666666666667,
      "grad_norm": 2.0975106096443596e-09,
      "kl": 0.04559326171875,
      "learning_rate": 4.0845579143051625e-06,
      "loss": 0.0018,
      "num_tokens": 154791705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2195
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.732,
      "grad_norm": 1.335821675496618e-09,
      "kl": 0.04095458984375,
      "learning_rate": 4.075180513460695e-06,
      "loss": 0.0016,
      "num_tokens": 154866137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2196
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7323333333333333,
      "grad_norm": 1.5670132968992334e-09,
      "kl": 0.042236328125,
      "learning_rate": 4.065811133962987e-06,
      "loss": 0.0017,
      "num_tokens": 154941529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2197
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7326666666666667,
      "grad_norm": 1.7830900089776947e-09,
      "kl": 0.044921875,
      "learning_rate": 4.056449788496824e-06,
      "loss": 0.0018,
      "num_tokens": 155017337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2198
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.733,
      "grad_norm": 1.6873773489578525e-09,
      "kl": 0.0445556640625,
      "learning_rate": 4.047096489736102e-06,
      "loss": 0.0018,
      "num_tokens": 155093417.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2199
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7333333333333333,
      "grad_norm": 1.994529652549204e-09,
      "kl": 0.04571533203125,
      "learning_rate": 4.037751250343841e-06,
      "loss": 0.0018,
      "num_tokens": 155168889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2200
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7336666666666667,
      "grad_norm": 1.3627780015568192e-09,
      "kl": 0.04937744140625,
      "learning_rate": 4.028414082972141e-06,
      "loss": 0.002,
      "num_tokens": 155242681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2201
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.734,
      "grad_norm": 1.4729567565652246e-09,
      "kl": 0.043701171875,
      "learning_rate": 4.019085000262164e-06,
      "loss": 0.0017,
      "num_tokens": 155318105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2202
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7343333333333333,
      "grad_norm": 1.828477591558908e-09,
      "kl": 0.04541015625,
      "learning_rate": 4.009764014844143e-06,
      "loss": 0.0018,
      "num_tokens": 155391433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2203
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7346666666666667,
      "grad_norm": 2.021040668154228e-09,
      "kl": 0.04583740234375,
      "learning_rate": 4.000451139337338e-06,
      "loss": 0.0018,
      "num_tokens": 155467161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2204
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.735,
      "grad_norm": 2.5734336883687092e-09,
      "kl": 0.04534912109375,
      "learning_rate": 3.9911463863500365e-06,
      "loss": 0.0018,
      "num_tokens": 155544313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2205
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7353333333333333,
      "grad_norm": 1.1503157315573276e-09,
      "kl": 0.0467529296875,
      "learning_rate": 3.981849768479516e-06,
      "loss": 0.0019,
      "num_tokens": 155620089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2206
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7356666666666667,
      "grad_norm": 1.616610290078313e-09,
      "kl": 0.04827880859375,
      "learning_rate": 3.972561298312063e-06,
      "loss": 0.0019,
      "num_tokens": 155696233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2207
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.736,
      "grad_norm": 2.1778323588961257e-09,
      "kl": 0.04852294921875,
      "learning_rate": 3.96328098842291e-06,
      "loss": 0.0019,
      "num_tokens": 155772857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2208
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7363333333333333,
      "grad_norm": 1.3141869814603524e-09,
      "kl": 0.041748046875,
      "learning_rate": 3.954008851376252e-06,
      "loss": 0.0017,
      "num_tokens": 155848377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2209
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7366666666666667,
      "grad_norm": 1.4419735405724055e-09,
      "kl": 0.04840087890625,
      "learning_rate": 3.944744899725221e-06,
      "loss": 0.0019,
      "num_tokens": 155923609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2210
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.737,
      "grad_norm": 1.4683484428346105e-09,
      "kl": 0.0452880859375,
      "learning_rate": 3.9354891460118695e-06,
      "loss": 0.0018,
      "num_tokens": 155998777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2211
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7373333333333333,
      "grad_norm": 1.9341483969981255e-09,
      "kl": 0.0418701171875,
      "learning_rate": 3.9262416027671354e-06,
      "loss": 0.0017,
      "num_tokens": 156075849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2212
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7376666666666667,
      "grad_norm": 2.080253524994191e-09,
      "kl": 0.0482177734375,
      "learning_rate": 3.917002282510854e-06,
      "loss": 0.0019,
      "num_tokens": 156151481.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2213
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.738,
      "grad_norm": 0.00037415779661387205,
      "kl": 0.046875,
      "learning_rate": 3.907771197751737e-06,
      "loss": 0.0019,
      "num_tokens": 156226617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2214
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7383333333333333,
      "grad_norm": 1.9200918632833464e-09,
      "kl": 0.043701171875,
      "learning_rate": 3.898548360987325e-06,
      "loss": 0.0017,
      "num_tokens": 156304889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2215
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7386666666666667,
      "grad_norm": 1.5810110998160098e-09,
      "kl": 0.04522705078125,
      "learning_rate": 3.889333784704003e-06,
      "loss": 0.0018,
      "num_tokens": 156378201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2216
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.739,
      "grad_norm": 1.56274360119113e-09,
      "kl": 0.04571533203125,
      "learning_rate": 3.880127481376975e-06,
      "loss": 0.0018,
      "num_tokens": 156453193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2217
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7393333333333333,
      "grad_norm": 1.0836347374976185e-09,
      "kl": 0.045166015625,
      "learning_rate": 3.8709294634702374e-06,
      "loss": 0.0018,
      "num_tokens": 156527657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2218
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7396666666666667,
      "grad_norm": 1.7463023249675302e-09,
      "kl": 0.0477294921875,
      "learning_rate": 3.861739743436575e-06,
      "loss": 0.0019,
      "num_tokens": 156605993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2219
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.74,
      "grad_norm": 2.3053099429404256e-09,
      "kl": 0.045654296875,
      "learning_rate": 3.852558333717536e-06,
      "loss": 0.0018,
      "num_tokens": 156691833.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2220
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7403333333333333,
      "grad_norm": 1.6341034081435168e-09,
      "kl": 0.0462646484375,
      "learning_rate": 3.8433852467434175e-06,
      "loss": 0.0019,
      "num_tokens": 156766553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2221
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7406666666666667,
      "grad_norm": 1.4812049364820723e-09,
      "kl": 0.04595947265625,
      "learning_rate": 3.834220494933252e-06,
      "loss": 0.0018,
      "num_tokens": 156842393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2222
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.741,
      "grad_norm": 1.509101621444131e-09,
      "kl": 0.046630859375,
      "learning_rate": 3.825064090694785e-06,
      "loss": 0.0019,
      "num_tokens": 156918153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2223
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7413333333333333,
      "grad_norm": 3.709704987642226e-09,
      "kl": 0.04644775390625,
      "learning_rate": 3.81591604642446e-06,
      "loss": 0.0019,
      "num_tokens": 156994569.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2224
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7416666666666667,
      "grad_norm": 1.5297644262446397e-09,
      "kl": 0.04473876953125,
      "learning_rate": 3.8067763745074017e-06,
      "loss": 0.0018,
      "num_tokens": 157072681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2225
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.742,
      "grad_norm": 1.262082105490947e-09,
      "kl": 0.046630859375,
      "learning_rate": 3.797645087317401e-06,
      "loss": 0.0019,
      "num_tokens": 157145897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2226
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7423333333333333,
      "grad_norm": 1.6993195739445355e-09,
      "kl": 0.04669189453125,
      "learning_rate": 3.7885221972168974e-06,
      "loss": 0.0019,
      "num_tokens": 157219929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2227
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7426666666666667,
      "grad_norm": 1.5095448224755614e-09,
      "kl": 0.04541015625,
      "learning_rate": 3.779407716556962e-06,
      "loss": 0.0018,
      "num_tokens": 157298521.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2228
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.743,
      "grad_norm": 1.6611865216731303e-09,
      "kl": 0.0457763671875,
      "learning_rate": 3.77030165767728e-06,
      "loss": 0.0018,
      "num_tokens": 157371961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2229
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7433333333333333,
      "grad_norm": 1.2625971379520706e-09,
      "kl": 0.0478515625,
      "learning_rate": 3.7612040329061405e-06,
      "loss": 0.0019,
      "num_tokens": 157445657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2230
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7436666666666667,
      "grad_norm": 1.2067157273421003e-09,
      "kl": 0.04266357421875,
      "learning_rate": 3.7521148545604003e-06,
      "loss": 0.0017,
      "num_tokens": 157521177.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2231
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.744,
      "grad_norm": 8.376968563261755e-10,
      "kl": 0.0438232421875,
      "learning_rate": 3.7430341349454924e-06,
      "loss": 0.0018,
      "num_tokens": 157598345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2232
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7443333333333333,
      "grad_norm": 1.495280566032875e-09,
      "kl": 0.04571533203125,
      "learning_rate": 3.7339618863553983e-06,
      "loss": 0.0018,
      "num_tokens": 157674889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2233
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7446666666666667,
      "grad_norm": 1.8390252654043593e-09,
      "kl": 0.04840087890625,
      "learning_rate": 3.7248981210726186e-06,
      "loss": 0.0019,
      "num_tokens": 157751497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2234
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.745,
      "grad_norm": 1.4266673398211083e-09,
      "kl": 0.04388427734375,
      "learning_rate": 3.7158428513681876e-06,
      "loss": 0.0018,
      "num_tokens": 157825913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2235
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7453333333333333,
      "grad_norm": 1.625565237972637e-09,
      "kl": 0.04443359375,
      "learning_rate": 3.7067960895016277e-06,
      "loss": 0.0018,
      "num_tokens": 157899161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2236
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7456666666666667,
      "grad_norm": 1.4522918423409692e-09,
      "kl": 0.04644775390625,
      "learning_rate": 3.6977578477209352e-06,
      "loss": 0.0019,
      "num_tokens": 157975753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2237
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.746,
      "grad_norm": 2.4619606353581958e-09,
      "kl": 0.04443359375,
      "learning_rate": 3.6887281382625838e-06,
      "loss": 0.0018,
      "num_tokens": 158053209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2238
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7463333333333333,
      "grad_norm": 2.241760110877067e-09,
      "kl": 0.04345703125,
      "learning_rate": 3.679706973351491e-06,
      "loss": 0.0017,
      "num_tokens": 158131609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2239
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7466666666666667,
      "grad_norm": 1.5271621744972208e-09,
      "kl": 0.04425048828125,
      "learning_rate": 3.6706943652010073e-06,
      "loss": 0.0018,
      "num_tokens": 158206153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2240
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.747,
      "grad_norm": 2.35271602200271e-09,
      "kl": 0.042236328125,
      "learning_rate": 3.661690326012897e-06,
      "loss": 0.0017,
      "num_tokens": 158288633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2241
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7473333333333333,
      "grad_norm": 1.4481499333030001e-09,
      "kl": 0.046630859375,
      "learning_rate": 3.6526948679773256e-06,
      "loss": 0.0019,
      "num_tokens": 158364537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2242
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7476666666666667,
      "grad_norm": 1.6750054676606396e-09,
      "kl": 0.04803466796875,
      "learning_rate": 3.6437080032728355e-06,
      "loss": 0.0019,
      "num_tokens": 158439609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2243
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.748,
      "grad_norm": 1.4223997535367516e-09,
      "kl": 0.0439453125,
      "learning_rate": 3.634729744066341e-06,
      "loss": 0.0018,
      "num_tokens": 158514809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2244
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7483333333333333,
      "grad_norm": 1.1278374900669519e-09,
      "kl": 0.04345703125,
      "learning_rate": 3.625760102513103e-06,
      "loss": 0.0017,
      "num_tokens": 158589209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2245
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7486666666666667,
      "grad_norm": 1.9592740763130223e-09,
      "kl": 0.0450439453125,
      "learning_rate": 3.6167990907567207e-06,
      "loss": 0.0018,
      "num_tokens": 158667289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2246
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.749,
      "grad_norm": 1.509282587797145e-09,
      "kl": 0.050048828125,
      "learning_rate": 3.6078467209290936e-06,
      "loss": 0.002,
      "num_tokens": 158740137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2247
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7493333333333333,
      "grad_norm": 1.8624178865778163e-09,
      "kl": 0.0458984375,
      "learning_rate": 3.598903005150444e-06,
      "loss": 0.0018,
      "num_tokens": 158815113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2248
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7496666666666667,
      "grad_norm": 1.337460475703267e-09,
      "kl": 0.04266357421875,
      "learning_rate": 3.5899679555292654e-06,
      "loss": 0.0017,
      "num_tokens": 158890697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2249
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.75,
      "grad_norm": 1.4602919984341156e-09,
      "kl": 0.041259765625,
      "learning_rate": 3.5810415841623146e-06,
      "loss": 0.0017,
      "num_tokens": 158966825.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2250
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7503333333333333,
      "grad_norm": 2.3775514890189697e-09,
      "kl": 0.04730224609375,
      "learning_rate": 3.5721239031346067e-06,
      "loss": 0.0019,
      "num_tokens": 159043097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2251
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7506666666666667,
      "grad_norm": 1.4219833088802147e-09,
      "kl": 0.044189453125,
      "learning_rate": 3.563214924519394e-06,
      "loss": 0.0018,
      "num_tokens": 159118777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2252
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.751,
      "grad_norm": 1.4575084472667754e-09,
      "kl": 0.04547119140625,
      "learning_rate": 3.554314660378133e-06,
      "loss": 0.0018,
      "num_tokens": 159193993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2253
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7513333333333333,
      "grad_norm": 1.7561726517456577e-09,
      "kl": 0.0458984375,
      "learning_rate": 3.545423122760493e-06,
      "loss": 0.0018,
      "num_tokens": 159268649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2254
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7516666666666667,
      "grad_norm": 1.185661235858504e-09,
      "kl": 0.045166015625,
      "learning_rate": 3.5365403237043373e-06,
      "loss": 0.0018,
      "num_tokens": 159343033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2255
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.752,
      "grad_norm": 1.4063439301992275e-09,
      "kl": 0.04339599609375,
      "learning_rate": 3.527666275235677e-06,
      "loss": 0.0017,
      "num_tokens": 159417033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2256
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7523333333333333,
      "grad_norm": 1.7714633093746102e-09,
      "kl": 0.0460205078125,
      "learning_rate": 3.5188009893686916e-06,
      "loss": 0.0018,
      "num_tokens": 159494425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2257
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7526666666666667,
      "grad_norm": 1.5727866786718891e-09,
      "kl": 0.040771484375,
      "learning_rate": 3.5099444781056956e-06,
      "loss": 0.0016,
      "num_tokens": 159571881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2258
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.753,
      "grad_norm": 1.9258610262085085e-09,
      "kl": 0.04815673828125,
      "learning_rate": 3.5010967534371167e-06,
      "loss": 0.0019,
      "num_tokens": 159647193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2259
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7533333333333333,
      "grad_norm": 1.6801326996329635e-09,
      "kl": 0.04266357421875,
      "learning_rate": 3.492257827341492e-06,
      "loss": 0.0017,
      "num_tokens": 159723369.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2260
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7536666666666667,
      "grad_norm": 2.1229209501427704e-09,
      "kl": 0.04486083984375,
      "learning_rate": 3.483427711785449e-06,
      "loss": 0.0018,
      "num_tokens": 159802473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2261
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.754,
      "grad_norm": 1.4113106239221906e-09,
      "kl": 0.04278564453125,
      "learning_rate": 3.474606418723683e-06,
      "loss": 0.0017,
      "num_tokens": 159877369.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2262
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7543333333333333,
      "grad_norm": 1.7367685067881666e-09,
      "kl": 0.04779052734375,
      "learning_rate": 3.4657939600989453e-06,
      "loss": 0.0019,
      "num_tokens": 159952393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2263
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7546666666666667,
      "grad_norm": 9.468510420163057e-10,
      "kl": 0.04681396484375,
      "learning_rate": 3.45699034784203e-06,
      "loss": 0.0019,
      "num_tokens": 160026921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2264
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.755,
      "grad_norm": 1.697812224144002e-09,
      "kl": 0.0482177734375,
      "learning_rate": 3.4481955938717514e-06,
      "loss": 0.0019,
      "num_tokens": 160102457.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2265
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7553333333333333,
      "grad_norm": 1.3509019458624039e-09,
      "kl": 0.0478515625,
      "learning_rate": 3.4394097100949286e-06,
      "loss": 0.0019,
      "num_tokens": 160176153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2266
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7556666666666667,
      "grad_norm": 1.6573364902683352e-09,
      "kl": 0.04400634765625,
      "learning_rate": 3.4306327084063762e-06,
      "loss": 0.0018,
      "num_tokens": 160252457.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2267
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.756,
      "grad_norm": 1.6228497434767064e-09,
      "kl": 0.0472412109375,
      "learning_rate": 3.4218646006888836e-06,
      "loss": 0.0019,
      "num_tokens": 160327673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2268
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7563333333333333,
      "grad_norm": 1.7708124966375749e-09,
      "kl": 0.04443359375,
      "learning_rate": 3.4131053988131947e-06,
      "loss": 0.0018,
      "num_tokens": 160405705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2269
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7566666666666667,
      "grad_norm": 1.0130717376100051e-09,
      "kl": 0.04345703125,
      "learning_rate": 3.4043551146380026e-06,
      "loss": 0.0017,
      "num_tokens": 160479033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2270
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.757,
      "grad_norm": 1.2933142334858871e-09,
      "kl": 0.04498291015625,
      "learning_rate": 3.3956137600099248e-06,
      "loss": 0.0018,
      "num_tokens": 160552425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2271
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7573333333333333,
      "grad_norm": 1.544075423076663e-09,
      "kl": 0.04736328125,
      "learning_rate": 3.3868813467634833e-06,
      "loss": 0.0019,
      "num_tokens": 160626761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2272
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7576666666666667,
      "grad_norm": 1.4945190640602846e-09,
      "kl": 0.048095703125,
      "learning_rate": 3.3781578867211016e-06,
      "loss": 0.0019,
      "num_tokens": 160702905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2273
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.758,
      "grad_norm": 1.3849410507305038e-09,
      "kl": 0.04541015625,
      "learning_rate": 3.3694433916930803e-06,
      "loss": 0.0018,
      "num_tokens": 160778617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2274
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7583333333333333,
      "grad_norm": 1.4927449276669336e-09,
      "kl": 0.0430908203125,
      "learning_rate": 3.360737873477584e-06,
      "loss": 0.0017,
      "num_tokens": 160854937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2275
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7586666666666667,
      "grad_norm": 1.297062901528534e-09,
      "kl": 0.04217529296875,
      "learning_rate": 3.3520413438606215e-06,
      "loss": 0.0017,
      "num_tokens": 160929801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2276
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.759,
      "grad_norm": 1.4698717798466987e-09,
      "kl": 0.048095703125,
      "learning_rate": 3.343353814616036e-06,
      "loss": 0.0019,
      "num_tokens": 161006521.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2277
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7593333333333333,
      "grad_norm": 1.4599065289999658e-09,
      "kl": 0.04058837890625,
      "learning_rate": 3.3346752975054763e-06,
      "loss": 0.0016,
      "num_tokens": 161080665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2278
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7596666666666667,
      "grad_norm": 1.3320082814516354e-09,
      "kl": 0.04168701171875,
      "learning_rate": 3.3260058042784014e-06,
      "loss": 0.0017,
      "num_tokens": 161155449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2279
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.76,
      "grad_norm": 1.2108786195952348e-09,
      "kl": 0.0477294921875,
      "learning_rate": 3.3173453466720473e-06,
      "loss": 0.0019,
      "num_tokens": 161229177.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2280
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7603333333333333,
      "grad_norm": 1.9602948153618627e-09,
      "kl": 0.04644775390625,
      "learning_rate": 3.308693936411421e-06,
      "loss": 0.0019,
      "num_tokens": 161304921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2281
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7606666666666667,
      "grad_norm": 2.8977975574662196e-09,
      "kl": 0.04522705078125,
      "learning_rate": 3.3000515852092684e-06,
      "loss": 0.0018,
      "num_tokens": 161381913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2282
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.761,
      "grad_norm": 1.3070662330250116e-09,
      "kl": 0.0399169921875,
      "learning_rate": 3.291418304766092e-06,
      "loss": 0.0016,
      "num_tokens": 161457289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2283
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7613333333333333,
      "grad_norm": 1.643856051281034e-09,
      "kl": 0.0478515625,
      "learning_rate": 3.2827941067700996e-06,
      "loss": 0.0019,
      "num_tokens": 161531529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2284
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7616666666666667,
      "grad_norm": 1.9552484076257315e-09,
      "kl": 0.049072265625,
      "learning_rate": 3.2741790028972e-06,
      "loss": 0.002,
      "num_tokens": 161606649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2285
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.762,
      "grad_norm": 1.3871428450329404e-09,
      "kl": 0.04248046875,
      "learning_rate": 3.265573004810997e-06,
      "loss": 0.0017,
      "num_tokens": 161682041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2286
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7623333333333333,
      "grad_norm": 1.2043264163708045e-09,
      "kl": 0.04296875,
      "learning_rate": 3.2569761241627694e-06,
      "loss": 0.0017,
      "num_tokens": 161756713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2287
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7626666666666667,
      "grad_norm": 9.494676156407422e-10,
      "kl": 0.0458984375,
      "learning_rate": 3.24838837259144e-06,
      "loss": 0.0018,
      "num_tokens": 161829897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2288
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.763,
      "grad_norm": 1.6194705576566548e-09,
      "kl": 0.0482177734375,
      "learning_rate": 3.239809761723579e-06,
      "loss": 0.0019,
      "num_tokens": 161906537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2289
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7633333333333333,
      "grad_norm": 2.926664910418708e-09,
      "kl": 0.04815673828125,
      "learning_rate": 3.2312403031733943e-06,
      "loss": 0.0019,
      "num_tokens": 161983929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2290
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7636666666666667,
      "grad_norm": 2.797790443764825e-09,
      "kl": 0.04498291015625,
      "learning_rate": 3.222680008542678e-06,
      "loss": 0.0018,
      "num_tokens": 162063881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2291
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.764,
      "grad_norm": 1.7914595362711339e-09,
      "kl": 0.04681396484375,
      "learning_rate": 3.2141288894208334e-06,
      "loss": 0.0019,
      "num_tokens": 162138409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2292
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7643333333333333,
      "grad_norm": 2.1897184065977626e-09,
      "kl": 0.04925537109375,
      "learning_rate": 3.2055869573848374e-06,
      "loss": 0.002,
      "num_tokens": 162211673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2293
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7646666666666667,
      "grad_norm": 2.216133276888854e-09,
      "kl": 0.04693603515625,
      "learning_rate": 3.1970542239992244e-06,
      "loss": 0.0019,
      "num_tokens": 162286921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2294
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.765,
      "grad_norm": 1.800288029762953e-09,
      "kl": 0.04779052734375,
      "learning_rate": 3.188530700816078e-06,
      "loss": 0.0019,
      "num_tokens": 162363097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2295
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7653333333333333,
      "grad_norm": 2.3931114867536962e-09,
      "kl": 0.04345703125,
      "learning_rate": 3.1800163993750166e-06,
      "loss": 0.0017,
      "num_tokens": 162438361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2296
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7656666666666667,
      "grad_norm": 1.9011319185580078e-09,
      "kl": 0.04498291015625,
      "learning_rate": 3.1715113312031674e-06,
      "loss": 0.0018,
      "num_tokens": 162514041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2297
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.766,
      "grad_norm": 1.766339630115965e-09,
      "kl": 0.04669189453125,
      "learning_rate": 3.1630155078151626e-06,
      "loss": 0.0019,
      "num_tokens": 162590473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2298
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7663333333333333,
      "grad_norm": 1.2543162064559965e-09,
      "kl": 0.04583740234375,
      "learning_rate": 3.1545289407131128e-06,
      "loss": 0.0018,
      "num_tokens": 162665433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2299
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7666666666666667,
      "grad_norm": 1.3188758973825543e-09,
      "kl": 0.04119873046875,
      "learning_rate": 3.146051641386605e-06,
      "loss": 0.0016,
      "num_tokens": 162741929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2300
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.767,
      "grad_norm": 2.019062028679741e-09,
      "kl": 0.04248046875,
      "learning_rate": 3.1375836213126653e-06,
      "loss": 0.0017,
      "num_tokens": 162820873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2301
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7673333333333333,
      "grad_norm": 1.2310045205410347e-09,
      "kl": 0.04547119140625,
      "learning_rate": 3.1291248919557717e-06,
      "loss": 0.0018,
      "num_tokens": 162896425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2302
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7676666666666667,
      "grad_norm": 1.3521155306506216e-09,
      "kl": 0.046142578125,
      "learning_rate": 3.1206754647678137e-06,
      "loss": 0.0018,
      "num_tokens": 162970121.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2303
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.768,
      "grad_norm": 1.940174021441976e-09,
      "kl": 0.04833984375,
      "learning_rate": 3.1122353511880943e-06,
      "loss": 0.0019,
      "num_tokens": 163046377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2304
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7683333333333333,
      "grad_norm": 1.8324327610841351e-09,
      "kl": 0.04742431640625,
      "learning_rate": 3.103804562643302e-06,
      "loss": 0.0019,
      "num_tokens": 163120697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2305
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7686666666666667,
      "grad_norm": 1.0934843031051855e-09,
      "kl": 0.0465087890625,
      "learning_rate": 3.0953831105475064e-06,
      "loss": 0.0019,
      "num_tokens": 163195849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2306
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.769,
      "grad_norm": 1.4514314194968847e-09,
      "kl": 0.047119140625,
      "learning_rate": 3.086971006302125e-06,
      "loss": 0.0019,
      "num_tokens": 163270969.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2307
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7693333333333333,
      "grad_norm": 1.7702331822633255e-09,
      "kl": 0.0458984375,
      "learning_rate": 3.0785682612959334e-06,
      "loss": 0.0018,
      "num_tokens": 163345817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2308
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7696666666666667,
      "grad_norm": 1.3495302653154795e-09,
      "kl": 0.04681396484375,
      "learning_rate": 3.0701748869050285e-06,
      "loss": 0.0019,
      "num_tokens": 163420313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2309
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.77,
      "grad_norm": 1.9393309180770757e-09,
      "kl": 0.0455322265625,
      "learning_rate": 3.0617908944928223e-06,
      "loss": 0.0018,
      "num_tokens": 163494665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2310
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7703333333333333,
      "grad_norm": 1.882447975276591e-09,
      "kl": 0.0418701171875,
      "learning_rate": 3.0534162954100264e-06,
      "loss": 0.0017,
      "num_tokens": 163570681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2311
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7706666666666667,
      "grad_norm": 1.077265610049949e-09,
      "kl": 0.04473876953125,
      "learning_rate": 3.0450511009946373e-06,
      "loss": 0.0018,
      "num_tokens": 163646169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2312
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.771,
      "grad_norm": 1.3503392848335238e-09,
      "kl": 0.040771484375,
      "learning_rate": 3.0366953225719076e-06,
      "loss": 0.0016,
      "num_tokens": 163722537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2313
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7713333333333333,
      "grad_norm": 1.962284779111201e-09,
      "kl": 0.04376220703125,
      "learning_rate": 3.028348971454356e-06,
      "loss": 0.0017,
      "num_tokens": 163798409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2314
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7716666666666666,
      "grad_norm": 1.641190849888119e-09,
      "kl": 0.04571533203125,
      "learning_rate": 3.0200120589417293e-06,
      "loss": 0.0018,
      "num_tokens": 163873417.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2315
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.772,
      "grad_norm": 2.3541986138297943e-09,
      "kl": 0.04461669921875,
      "learning_rate": 3.0116845963209996e-06,
      "loss": 0.0018,
      "num_tokens": 163954057.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2316
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7723333333333333,
      "grad_norm": 2.1685944151528247e-09,
      "kl": 0.044677734375,
      "learning_rate": 3.003366594866345e-06,
      "loss": 0.0018,
      "num_tokens": 164029993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2317
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7726666666666666,
      "grad_norm": 1.3567931222979723e-09,
      "kl": 0.04339599609375,
      "learning_rate": 2.995058065839136e-06,
      "loss": 0.0017,
      "num_tokens": 164105609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2318
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.773,
      "grad_norm": 1.2452002762231018e-09,
      "kl": 0.04486083984375,
      "learning_rate": 2.9867590204879117e-06,
      "loss": 0.0018,
      "num_tokens": 164180921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2319
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7733333333333333,
      "grad_norm": 1.4944585569054425e-09,
      "kl": 0.04345703125,
      "learning_rate": 2.978469470048376e-06,
      "loss": 0.0017,
      "num_tokens": 164255977.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2320
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7736666666666666,
      "grad_norm": 1.7076284830608301e-09,
      "kl": 0.047119140625,
      "learning_rate": 2.970189425743383e-06,
      "loss": 0.0019,
      "num_tokens": 164330345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2321
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.774,
      "grad_norm": 1.4634683465075682e-09,
      "kl": 0.04388427734375,
      "learning_rate": 2.961918898782914e-06,
      "loss": 0.0018,
      "num_tokens": 164407049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2322
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7743333333333333,
      "grad_norm": 1.6869347030379345e-09,
      "kl": 0.0457763671875,
      "learning_rate": 2.953657900364053e-06,
      "loss": 0.0018,
      "num_tokens": 164482073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2323
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7746666666666666,
      "grad_norm": 1.0466174593659616e-09,
      "kl": 0.04376220703125,
      "learning_rate": 2.945406441671005e-06,
      "loss": 0.0018,
      "num_tokens": 164559065.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2324
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.775,
      "grad_norm": 1.0007918938015337e-09,
      "kl": 0.0419921875,
      "learning_rate": 2.9371645338750477e-06,
      "loss": 0.0017,
      "num_tokens": 164634313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2325
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7753333333333333,
      "grad_norm": 2.378643948475201e-09,
      "kl": 0.04864501953125,
      "learning_rate": 2.9289321881345257e-06,
      "loss": 0.0019,
      "num_tokens": 164712217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2326
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7756666666666666,
      "grad_norm": 1.942564331613994e-09,
      "kl": 0.0478515625,
      "learning_rate": 2.9207094155948435e-06,
      "loss": 0.0019,
      "num_tokens": 164790921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2327
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.776,
      "grad_norm": 1.4715664242714865e-09,
      "kl": 0.04351806640625,
      "learning_rate": 2.912496227388446e-06,
      "loss": 0.0017,
      "num_tokens": 164866057.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2328
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7763333333333333,
      "grad_norm": 1.4215539856365922e-09,
      "kl": 0.04486083984375,
      "learning_rate": 2.9042926346347932e-06,
      "loss": 0.0018,
      "num_tokens": 164940665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2329
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7766666666666666,
      "grad_norm": 1.0158451857478212e-09,
      "kl": 0.043701171875,
      "learning_rate": 2.896098648440362e-06,
      "loss": 0.0017,
      "num_tokens": 165015929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2330
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.777,
      "grad_norm": 1.7840733335106052e-09,
      "kl": 0.0423583984375,
      "learning_rate": 2.8879142798986293e-06,
      "loss": 0.0017,
      "num_tokens": 165093049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2331
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7773333333333333,
      "grad_norm": 1.058192755643006e-09,
      "kl": 0.0423583984375,
      "learning_rate": 2.8797395400900362e-06,
      "loss": 0.0017,
      "num_tokens": 165168729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2332
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7776666666666666,
      "grad_norm": 1.3663588038781427e-09,
      "kl": 0.0498046875,
      "learning_rate": 2.8715744400819976e-06,
      "loss": 0.002,
      "num_tokens": 165244489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2333
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.778,
      "grad_norm": 1.1639131880514242e-09,
      "kl": 0.04754638671875,
      "learning_rate": 2.863418990928876e-06,
      "loss": 0.0019,
      "num_tokens": 165319865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2334
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7783333333333333,
      "grad_norm": 2.2448625180970794e-09,
      "kl": 0.04278564453125,
      "learning_rate": 2.855273203671969e-06,
      "loss": 0.0017,
      "num_tokens": 165396313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2335
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7786666666666666,
      "grad_norm": 1.9980463950020066e-09,
      "kl": 0.0447998046875,
      "learning_rate": 2.8471370893394866e-06,
      "loss": 0.0018,
      "num_tokens": 165474361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2336
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.779,
      "grad_norm": 1.5071405234934332e-09,
      "kl": 0.0447998046875,
      "learning_rate": 2.8390106589465514e-06,
      "loss": 0.0018,
      "num_tokens": 165546777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2337
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7793333333333333,
      "grad_norm": 1.938062599293744e-09,
      "kl": 0.046630859375,
      "learning_rate": 2.830893923495173e-06,
      "loss": 0.0019,
      "num_tokens": 165626217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2338
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7796666666666666,
      "grad_norm": 1.4758693156480263e-09,
      "kl": 0.04693603515625,
      "learning_rate": 2.8227868939742333e-06,
      "loss": 0.0019,
      "num_tokens": 165706105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2339
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.78,
      "grad_norm": 1.8107634280895013e-09,
      "kl": 0.043701171875,
      "learning_rate": 2.8146895813594754e-06,
      "loss": 0.0018,
      "num_tokens": 165781481.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2340
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7803333333333333,
      "grad_norm": 1.6678356473676104e-09,
      "kl": 0.04522705078125,
      "learning_rate": 2.8066019966134907e-06,
      "loss": 0.0018,
      "num_tokens": 165857641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2341
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7806666666666666,
      "grad_norm": 1.5116690121885767e-09,
      "kl": 0.04693603515625,
      "learning_rate": 2.79852415068569e-06,
      "loss": 0.0019,
      "num_tokens": 165932601.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2342
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.781,
      "grad_norm": 2.192241277398921e-09,
      "kl": 0.046875,
      "learning_rate": 2.7904560545123082e-06,
      "loss": 0.0019,
      "num_tokens": 166011609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2343
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7813333333333333,
      "grad_norm": 1.1048826298321046e-09,
      "kl": 0.046875,
      "learning_rate": 2.7823977190163788e-06,
      "loss": 0.0019,
      "num_tokens": 166087257.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2344
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7816666666666666,
      "grad_norm": 1.3459784398150987e-09,
      "kl": 0.047119140625,
      "learning_rate": 2.7743491551077197e-06,
      "loss": 0.0019,
      "num_tokens": 166162281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2345
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.782,
      "grad_norm": 1.7745145353131875e-09,
      "kl": 0.0438232421875,
      "learning_rate": 2.76631037368292e-06,
      "loss": 0.0018,
      "num_tokens": 166237513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2346
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7823333333333333,
      "grad_norm": 1.6565930849310462e-09,
      "kl": 0.0460205078125,
      "learning_rate": 2.7582813856253276e-06,
      "loss": 0.0018,
      "num_tokens": 166314089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2347
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7826666666666666,
      "grad_norm": 1.580381492338745e-09,
      "kl": 0.04852294921875,
      "learning_rate": 2.750262201805022e-06,
      "loss": 0.0019,
      "num_tokens": 166388905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2348
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.783,
      "grad_norm": 2.375486696237772e-09,
      "kl": 0.05133056640625,
      "learning_rate": 2.742252833078818e-06,
      "loss": 0.0021,
      "num_tokens": 166465465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2349
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7833333333333333,
      "grad_norm": 1.5615480020159112e-09,
      "kl": 0.05078125,
      "learning_rate": 2.7342532902902418e-06,
      "loss": 0.002,
      "num_tokens": 166541049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2350
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7836666666666666,
      "grad_norm": 1.1830009194468971e-09,
      "kl": 0.040771484375,
      "learning_rate": 2.726263584269513e-06,
      "loss": 0.0016,
      "num_tokens": 166616697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2351
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.784,
      "grad_norm": 9.003793821626971e-10,
      "kl": 0.05426025390625,
      "learning_rate": 2.718283725833537e-06,
      "loss": 0.0022,
      "num_tokens": 166691801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2352
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7843333333333333,
      "grad_norm": 1.589586351435912e-09,
      "kl": 0.04638671875,
      "learning_rate": 2.7103137257858867e-06,
      "loss": 0.0019,
      "num_tokens": 166765817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2353
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7846666666666666,
      "grad_norm": 1.4814635074245075e-09,
      "kl": 0.04248046875,
      "learning_rate": 2.7023535949167825e-06,
      "loss": 0.0017,
      "num_tokens": 166842025.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2354
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.785,
      "grad_norm": 2.4501907169849346e-09,
      "kl": 0.04644775390625,
      "learning_rate": 2.6944033440030894e-06,
      "loss": 0.0019,
      "num_tokens": 166921817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2355
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7853333333333333,
      "grad_norm": 1.3121582709274549e-09,
      "kl": 0.04400634765625,
      "learning_rate": 2.6864629838082957e-06,
      "loss": 0.0018,
      "num_tokens": 166996009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2356
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7856666666666666,
      "grad_norm": 1.3741502380426596e-09,
      "kl": 0.045166015625,
      "learning_rate": 2.678532525082498e-06,
      "loss": 0.0018,
      "num_tokens": 167071689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2357
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.786,
      "grad_norm": 1.5541981035482877e-09,
      "kl": 0.0478515625,
      "learning_rate": 2.670611978562386e-06,
      "loss": 0.0019,
      "num_tokens": 167148217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2358
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7863333333333333,
      "grad_norm": 1.3386144415150625e-09,
      "kl": 0.04669189453125,
      "learning_rate": 2.6627013549712355e-06,
      "loss": 0.0019,
      "num_tokens": 167223721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2359
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7866666666666666,
      "grad_norm": 1.6004723102369667e-09,
      "kl": 0.04437255859375,
      "learning_rate": 2.654800665018884e-06,
      "loss": 0.0018,
      "num_tokens": 167298889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2360
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.787,
      "grad_norm": 1.6842296446384353e-09,
      "kl": 0.0457763671875,
      "learning_rate": 2.6469099194017144e-06,
      "loss": 0.0018,
      "num_tokens": 167376121.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2361
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7873333333333333,
      "grad_norm": 9.208410145511436e-10,
      "kl": 0.04443359375,
      "learning_rate": 2.639029128802657e-06,
      "loss": 0.0018,
      "num_tokens": 167450233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2362
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7876666666666666,
      "grad_norm": 1.051167486387783e-09,
      "kl": 0.04791259765625,
      "learning_rate": 2.6311583038911625e-06,
      "loss": 0.0019,
      "num_tokens": 167524985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2363
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.788,
      "grad_norm": 2.5571273987168297e-09,
      "kl": 0.04632568359375,
      "learning_rate": 2.623297455323177e-06,
      "loss": 0.0018,
      "num_tokens": 167601401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2364
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7883333333333333,
      "grad_norm": 2.6806503683474148e-09,
      "kl": 0.045654296875,
      "learning_rate": 2.615446593741161e-06,
      "loss": 0.0018,
      "num_tokens": 167679097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2365
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7886666666666666,
      "grad_norm": 1.7121318807156172e-09,
      "kl": 0.05291748046875,
      "learning_rate": 2.607605729774041e-06,
      "loss": 0.0021,
      "num_tokens": 167754297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2366
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.789,
      "grad_norm": 1.2956629102944817e-09,
      "kl": 0.0440673828125,
      "learning_rate": 2.5997748740372053e-06,
      "loss": 0.0018,
      "num_tokens": 167830233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2367
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7893333333333333,
      "grad_norm": 2.3688029315849235e-09,
      "kl": 0.04931640625,
      "learning_rate": 2.5919540371325005e-06,
      "loss": 0.002,
      "num_tokens": 167907673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2368
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7896666666666666,
      "grad_norm": 9.83351400307697e-10,
      "kl": 0.04486083984375,
      "learning_rate": 2.584143229648207e-06,
      "loss": 0.0018,
      "num_tokens": 167983145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2369
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.79,
      "grad_norm": 1.5275106735046506e-09,
      "kl": 0.043701171875,
      "learning_rate": 2.576342462159024e-06,
      "loss": 0.0017,
      "num_tokens": 168057593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2370
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7903333333333333,
      "grad_norm": 1.900793966669312e-09,
      "kl": 0.0477294921875,
      "learning_rate": 2.5685517452260566e-06,
      "loss": 0.0019,
      "num_tokens": 168134409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2371
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7906666666666666,
      "grad_norm": 2.0142010281887224e-09,
      "kl": 0.04229736328125,
      "learning_rate": 2.5607710893968165e-06,
      "loss": 0.0017,
      "num_tokens": 168213913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2372
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.791,
      "grad_norm": 1.7203960478440194e-09,
      "kl": 0.0479736328125,
      "learning_rate": 2.5530005052051742e-06,
      "loss": 0.0019,
      "num_tokens": 168289689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2373
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7913333333333333,
      "grad_norm": 2.5012092397247443e-09,
      "kl": 0.05194091796875,
      "learning_rate": 2.5452400031713786e-06,
      "loss": 0.0021,
      "num_tokens": 168366169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2374
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7916666666666666,
      "grad_norm": 2.4596449321734326e-09,
      "kl": 0.04766845703125,
      "learning_rate": 2.5374895938020226e-06,
      "loss": 0.0019,
      "num_tokens": 168446025.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2375
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.792,
      "grad_norm": 1.0837808428476592e-09,
      "kl": 0.044921875,
      "learning_rate": 2.529749287590042e-06,
      "loss": 0.0018,
      "num_tokens": 168520761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2376
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7923333333333333,
      "grad_norm": 1.77758718855614e-09,
      "kl": 0.0482177734375,
      "learning_rate": 2.522019095014683e-06,
      "loss": 0.0019,
      "num_tokens": 168595337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2377
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7926666666666666,
      "grad_norm": 2.5617037380243346e-09,
      "kl": 0.0482177734375,
      "learning_rate": 2.514299026541508e-06,
      "loss": 0.0019,
      "num_tokens": 168672425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2378
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.793,
      "grad_norm": 1.6600208985195763e-09,
      "kl": 0.04388427734375,
      "learning_rate": 2.506589092622371e-06,
      "loss": 0.0018,
      "num_tokens": 168748361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2379
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7933333333333333,
      "grad_norm": 1.4085200783497953e-09,
      "kl": 0.042724609375,
      "learning_rate": 2.4988893036954045e-06,
      "loss": 0.0017,
      "num_tokens": 168823993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2380
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7936666666666666,
      "grad_norm": 2.6387643181635667e-09,
      "kl": 0.0457763671875,
      "learning_rate": 2.4911996701850083e-06,
      "loss": 0.0018,
      "num_tokens": 168900089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2381
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.794,
      "grad_norm": 1.388531623014444e-09,
      "kl": 0.04876708984375,
      "learning_rate": 2.4835202025018325e-06,
      "loss": 0.002,
      "num_tokens": 168974969.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2382
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7943333333333333,
      "grad_norm": 1.4662665526188334e-09,
      "kl": 0.048095703125,
      "learning_rate": 2.4758509110427576e-06,
      "loss": 0.0019,
      "num_tokens": 169049641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2383
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7946666666666666,
      "grad_norm": 1.5331171887567052e-09,
      "kl": 0.04803466796875,
      "learning_rate": 2.468191806190897e-06,
      "loss": 0.0019,
      "num_tokens": 169124025.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2384
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.795,
      "grad_norm": 1.3462297943078738e-09,
      "kl": 0.0418701171875,
      "learning_rate": 2.4605428983155667e-06,
      "loss": 0.0017,
      "num_tokens": 169201065.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2385
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7953333333333333,
      "grad_norm": 1.4274281756598839e-09,
      "kl": 0.04583740234375,
      "learning_rate": 2.45290419777228e-06,
      "loss": 0.0018,
      "num_tokens": 169277545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2386
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7956666666666666,
      "grad_norm": 2.1397907890019496e-09,
      "kl": 0.046142578125,
      "learning_rate": 2.4452757149027308e-06,
      "loss": 0.0018,
      "num_tokens": 169349929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2387
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.796,
      "grad_norm": 1.7044946565292207e-09,
      "kl": 0.0472412109375,
      "learning_rate": 2.4376574600347803e-06,
      "loss": 0.0019,
      "num_tokens": 169427225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2388
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7963333333333333,
      "grad_norm": 1.203796506921151e-09,
      "kl": 0.044921875,
      "learning_rate": 2.4300494434824373e-06,
      "loss": 0.0018,
      "num_tokens": 169502233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2389
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7966666666666666,
      "grad_norm": 1.950270611672522e-09,
      "kl": 0.04608154296875,
      "learning_rate": 2.422451675545855e-06,
      "loss": 0.0018,
      "num_tokens": 169578025.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2390
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.797,
      "grad_norm": 1.900799073695225e-09,
      "kl": 0.04547119140625,
      "learning_rate": 2.4148641665113116e-06,
      "loss": 0.0018,
      "num_tokens": 169658729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2391
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7973333333333333,
      "grad_norm": 1.4691889926865542e-09,
      "kl": 0.045166015625,
      "learning_rate": 2.407286926651192e-06,
      "loss": 0.0018,
      "num_tokens": 169733769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2392
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7976666666666666,
      "grad_norm": 1.5835007749487318e-09,
      "kl": 0.04644775390625,
      "learning_rate": 2.3997199662239825e-06,
      "loss": 0.0019,
      "num_tokens": 169809785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2393
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.798,
      "grad_norm": 2.099869611527083e-09,
      "kl": 0.04669189453125,
      "learning_rate": 2.39216329547425e-06,
      "loss": 0.0019,
      "num_tokens": 169887673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2394
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7983333333333333,
      "grad_norm": 1.0801550764938384e-09,
      "kl": 0.04376220703125,
      "learning_rate": 2.3846169246326345e-06,
      "loss": 0.0018,
      "num_tokens": 169963209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2395
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7986666666666666,
      "grad_norm": 1.4090657529663986e-09,
      "kl": 0.04681396484375,
      "learning_rate": 2.3770808639158216e-06,
      "loss": 0.0019,
      "num_tokens": 170037609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2396
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.799,
      "grad_norm": 1.9750074908841952e-09,
      "kl": 0.0457763671875,
      "learning_rate": 2.3695551235265492e-06,
      "loss": 0.0018,
      "num_tokens": 170113977.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2397
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7993333333333333,
      "grad_norm": 8.834233899968069e-10,
      "kl": 0.04327392578125,
      "learning_rate": 2.362039713653581e-06,
      "loss": 0.0017,
      "num_tokens": 170190009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2398
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.7996666666666666,
      "grad_norm": 2.0394412825197605e-09,
      "kl": 0.0478515625,
      "learning_rate": 2.3545346444716842e-06,
      "loss": 0.0019,
      "num_tokens": 170267177.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2399
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8,
      "grad_norm": 3.6643641454503495e-09,
      "kl": 0.0443115234375,
      "learning_rate": 2.347039926141644e-06,
      "loss": 0.0018,
      "num_tokens": 170342633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2400
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8003333333333333,
      "grad_norm": 2.8849962419030817e-09,
      "kl": 0.04779052734375,
      "learning_rate": 2.339555568810221e-06,
      "loss": 0.0019,
      "num_tokens": 170420201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2401
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8006666666666666,
      "grad_norm": 1.7071533076062906e-09,
      "kl": 0.0509033203125,
      "learning_rate": 2.332081582610146e-06,
      "loss": 0.002,
      "num_tokens": 170497865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2402
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.801,
      "grad_norm": 2.214656458221498e-09,
      "kl": 0.044921875,
      "learning_rate": 2.324617977660114e-06,
      "loss": 0.0018,
      "num_tokens": 170573561.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2403
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8013333333333333,
      "grad_norm": 1.0804485084392468e-09,
      "kl": 0.044677734375,
      "learning_rate": 2.317164764064769e-06,
      "loss": 0.0018,
      "num_tokens": 170649593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2404
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8016666666666666,
      "grad_norm": 1.6635335331471879e-09,
      "kl": 0.043212890625,
      "learning_rate": 2.309721951914675e-06,
      "loss": 0.0017,
      "num_tokens": 170724281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2405
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.802,
      "grad_norm": 1.2555982920048336e-09,
      "kl": 0.04559326171875,
      "learning_rate": 2.3022895512863207e-06,
      "loss": 0.0018,
      "num_tokens": 170799065.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2406
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8023333333333333,
      "grad_norm": 1.9337043077882754e-09,
      "kl": 0.043701171875,
      "learning_rate": 2.2948675722421086e-06,
      "loss": 0.0017,
      "num_tokens": 170880457.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2407
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8026666666666666,
      "grad_norm": 1.6985787221202031e-09,
      "kl": 0.0426025390625,
      "learning_rate": 2.2874560248303136e-06,
      "loss": 0.0017,
      "num_tokens": 170960617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2408
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.803,
      "grad_norm": 1.3114532793068179e-09,
      "kl": 0.04541015625,
      "learning_rate": 2.2800549190850997e-06,
      "loss": 0.0018,
      "num_tokens": 171035641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2409
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8033333333333333,
      "grad_norm": 2.086041117621562e-09,
      "kl": 0.04412841796875,
      "learning_rate": 2.27266426502649e-06,
      "loss": 0.0018,
      "num_tokens": 171110857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2410
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8036666666666666,
      "grad_norm": 1.9857901989439597e-09,
      "kl": 0.04803466796875,
      "learning_rate": 2.265284072660362e-06,
      "loss": 0.0019,
      "num_tokens": 171187833.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2411
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.804,
      "grad_norm": 1.628396750774641e-09,
      "kl": 0.04803466796875,
      "learning_rate": 2.257914351978422e-06,
      "loss": 0.0019,
      "num_tokens": 171262921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2412
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8043333333333333,
      "grad_norm": 1.436234797758118e-09,
      "kl": 0.04364013671875,
      "learning_rate": 2.2505551129582047e-06,
      "loss": 0.0017,
      "num_tokens": 171336969.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2413
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8046666666666666,
      "grad_norm": 1.533853821733544e-09,
      "kl": 0.04498291015625,
      "learning_rate": 2.2432063655630555e-06,
      "loss": 0.0018,
      "num_tokens": 171413129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2414
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.805,
      "grad_norm": 1.2900661650050438e-09,
      "kl": 0.05133056640625,
      "learning_rate": 2.2358681197421094e-06,
      "loss": 0.0021,
      "num_tokens": 171489225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2415
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8053333333333333,
      "grad_norm": 1.3863969972049972e-09,
      "kl": 0.04571533203125,
      "learning_rate": 2.2285403854302912e-06,
      "loss": 0.0018,
      "num_tokens": 171562409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2416
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8056666666666666,
      "grad_norm": 2.0053567695299535e-09,
      "kl": 0.04522705078125,
      "learning_rate": 2.2212231725482914e-06,
      "loss": 0.0018,
      "num_tokens": 171638121.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2417
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.806,
      "grad_norm": 1.0757739143940626e-09,
      "kl": 0.04119873046875,
      "learning_rate": 2.213916491002551e-06,
      "loss": 0.0016,
      "num_tokens": 171712617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2418
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8063333333333333,
      "grad_norm": 1.1906510222203792e-09,
      "kl": 0.04400634765625,
      "learning_rate": 2.206620350685257e-06,
      "loss": 0.0018,
      "num_tokens": 171788137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2419
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8066666666666666,
      "grad_norm": 2.6464888058796987e-09,
      "kl": 0.044921875,
      "learning_rate": 2.1993347614743355e-06,
      "loss": 0.0018,
      "num_tokens": 171868009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2420
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.807,
      "grad_norm": 1.6010388570464329e-09,
      "kl": 0.04620361328125,
      "learning_rate": 2.192059733233408e-06,
      "loss": 0.0019,
      "num_tokens": 171945033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2421
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8073333333333333,
      "grad_norm": 1.1750415085387544e-09,
      "kl": 0.04608154296875,
      "learning_rate": 2.1847952758118118e-06,
      "loss": 0.0018,
      "num_tokens": 172019161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2422
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8076666666666666,
      "grad_norm": 1.718983289045184e-09,
      "kl": 0.0462646484375,
      "learning_rate": 2.177541399044573e-06,
      "loss": 0.0018,
      "num_tokens": 172096217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2423
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.808,
      "grad_norm": 3.139448923050736e-09,
      "kl": 0.03826904296875,
      "learning_rate": 2.1702981127523827e-06,
      "loss": 0.0015,
      "num_tokens": 172178825.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2424
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8083333333333333,
      "grad_norm": 1.327751797397525e-09,
      "kl": 0.042236328125,
      "learning_rate": 2.163065426741603e-06,
      "loss": 0.0017,
      "num_tokens": 172252041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2425
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8086666666666666,
      "grad_norm": 2.454078273927962e-09,
      "kl": 0.04833984375,
      "learning_rate": 2.155843350804243e-06,
      "loss": 0.0019,
      "num_tokens": 172329273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2426
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.809,
      "grad_norm": 2.9387117184143108e-09,
      "kl": 0.041015625,
      "learning_rate": 2.1486318947179476e-06,
      "loss": 0.0016,
      "num_tokens": 172408441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2427
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8093333333333333,
      "grad_norm": 1.4199103004486346e-09,
      "kl": 0.04315185546875,
      "learning_rate": 2.1414310682459805e-06,
      "loss": 0.0017,
      "num_tokens": 172489289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2428
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8096666666666666,
      "grad_norm": 2.194127768362364e-09,
      "kl": 0.04583740234375,
      "learning_rate": 2.1342408811372217e-06,
      "loss": 0.0018,
      "num_tokens": 172564889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2429
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.81,
      "grad_norm": 1.92905491580575e-09,
      "kl": 0.04998779296875,
      "learning_rate": 2.1270613431261367e-06,
      "loss": 0.002,
      "num_tokens": 172640985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2430
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8103333333333333,
      "grad_norm": 3.520464808559609e-09,
      "kl": 0.04705810546875,
      "learning_rate": 2.119892463932781e-06,
      "loss": 0.0019,
      "num_tokens": 172717913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2431
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8106666666666666,
      "grad_norm": 2.200611470826175e-09,
      "kl": 0.04595947265625,
      "learning_rate": 2.1127342532627794e-06,
      "loss": 0.0018,
      "num_tokens": 172795289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2432
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.811,
      "grad_norm": 1.505857216699269e-09,
      "kl": 0.0458984375,
      "learning_rate": 2.10558672080731e-06,
      "loss": 0.0018,
      "num_tokens": 172869497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2433
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8113333333333334,
      "grad_norm": 1.1992862258836112e-09,
      "kl": 0.0460205078125,
      "learning_rate": 2.098449876243096e-06,
      "loss": 0.0018,
      "num_tokens": 172944249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2434
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8116666666666666,
      "grad_norm": 1.9396788619729932e-09,
      "kl": 0.04730224609375,
      "learning_rate": 2.091323729232391e-06,
      "loss": 0.0019,
      "num_tokens": 173019625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2435
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.812,
      "grad_norm": 2.285576394811528e-09,
      "kl": 0.04571533203125,
      "learning_rate": 2.084208289422968e-06,
      "loss": 0.0018,
      "num_tokens": 173097305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2436
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8123333333333334,
      "grad_norm": 1.8372916521514071e-09,
      "kl": 0.0447998046875,
      "learning_rate": 2.0771035664480944e-06,
      "loss": 0.0018,
      "num_tokens": 173173145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2437
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8126666666666666,
      "grad_norm": 2.9534956702548243e-09,
      "kl": 0.0400390625,
      "learning_rate": 2.070009569926539e-06,
      "loss": 0.0016,
      "num_tokens": 173251737.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2438
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.813,
      "grad_norm": 1.6942774960782003e-09,
      "kl": 0.0478515625,
      "learning_rate": 2.0629263094625476e-06,
      "loss": 0.0019,
      "num_tokens": 173327369.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2439
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8133333333333334,
      "grad_norm": 1.3462093662042207e-09,
      "kl": 0.046630859375,
      "learning_rate": 2.0558537946458177e-06,
      "loss": 0.0019,
      "num_tokens": 173402409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2440
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8136666666666666,
      "grad_norm": 1.0766267877215796e-09,
      "kl": 0.043701171875,
      "learning_rate": 2.048792035051521e-06,
      "loss": 0.0017,
      "num_tokens": 173475225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2441
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.814,
      "grad_norm": 1.3438288259948195e-09,
      "kl": 0.04754638671875,
      "learning_rate": 2.041741040240255e-06,
      "loss": 0.0019,
      "num_tokens": 173550889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2442
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8143333333333334,
      "grad_norm": 9.137198775377442e-10,
      "kl": 0.0440673828125,
      "learning_rate": 2.0347008197580376e-06,
      "loss": 0.0018,
      "num_tokens": 173623337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2443
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8146666666666667,
      "grad_norm": 1.3256589159738041e-09,
      "kl": 0.04620361328125,
      "learning_rate": 2.0276713831363115e-06,
      "loss": 0.0018,
      "num_tokens": 173697769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2444
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.815,
      "grad_norm": 1.99012784030117e-09,
      "kl": 0.047119140625,
      "learning_rate": 2.020652739891914e-06,
      "loss": 0.0019,
      "num_tokens": 173777913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2445
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8153333333333334,
      "grad_norm": 1.1917519193715975e-09,
      "kl": 0.04669189453125,
      "learning_rate": 2.013644899527074e-06,
      "loss": 0.0019,
      "num_tokens": 173852729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2446
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8156666666666667,
      "grad_norm": 1.3997284442623936e-09,
      "kl": 0.04888916015625,
      "learning_rate": 2.0066478715293826e-06,
      "loss": 0.002,
      "num_tokens": 173928073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2447
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.816,
      "grad_norm": 1.4483105825746634e-09,
      "kl": 0.0443115234375,
      "learning_rate": 1.9996616653718126e-06,
      "loss": 0.0018,
      "num_tokens": 174001305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2448
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8163333333333334,
      "grad_norm": 1.1990284320972933e-09,
      "kl": 0.04388427734375,
      "learning_rate": 1.9926862905126663e-06,
      "loss": 0.0018,
      "num_tokens": 174077577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2449
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8166666666666667,
      "grad_norm": 1.0822991391989945e-09,
      "kl": 0.04290771484375,
      "learning_rate": 1.9857217563955932e-06,
      "loss": 0.0017,
      "num_tokens": 174152921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2450
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.817,
      "grad_norm": 1.5778539586008833e-09,
      "kl": 0.0482177734375,
      "learning_rate": 1.9787680724495617e-06,
      "loss": 0.0019,
      "num_tokens": 174227993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2451
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8173333333333334,
      "grad_norm": 1.4429178962771516e-09,
      "kl": 0.04400634765625,
      "learning_rate": 1.9718252480888567e-06,
      "loss": 0.0018,
      "num_tokens": 174302137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2452
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8176666666666667,
      "grad_norm": 2.0569510539303337e-09,
      "kl": 0.0455322265625,
      "learning_rate": 1.964893292713049e-06,
      "loss": 0.0018,
      "num_tokens": 174379369.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2453
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.818,
      "grad_norm": 1.238439573114647e-09,
      "kl": 0.05059814453125,
      "learning_rate": 1.9579722157070026e-06,
      "loss": 0.002,
      "num_tokens": 174455305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2454
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8183333333333334,
      "grad_norm": 1.5458189173145342e-09,
      "kl": 0.04705810546875,
      "learning_rate": 1.95106202644086e-06,
      "loss": 0.0019,
      "num_tokens": 174531097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2455
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8186666666666667,
      "grad_norm": 2.202642512827424e-09,
      "kl": 0.04974365234375,
      "learning_rate": 1.9441627342700067e-06,
      "loss": 0.002,
      "num_tokens": 174606793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2456
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.819,
      "grad_norm": 1.2999635812249721e-09,
      "kl": 0.04541015625,
      "learning_rate": 1.9372743485350887e-06,
      "loss": 0.0018,
      "num_tokens": 174682217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2457
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8193333333333334,
      "grad_norm": 9.890632757247886e-10,
      "kl": 0.04913330078125,
      "learning_rate": 1.930396878561983e-06,
      "loss": 0.002,
      "num_tokens": 174756841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2458
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8196666666666667,
      "grad_norm": 1.257830506418145e-09,
      "kl": 0.04547119140625,
      "learning_rate": 1.9235303336617827e-06,
      "loss": 0.0018,
      "num_tokens": 174830681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2459
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.82,
      "grad_norm": 2.058286208139748e-09,
      "kl": 0.04510498046875,
      "learning_rate": 1.916674723130796e-06,
      "loss": 0.0018,
      "num_tokens": 174906809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2460
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8203333333333334,
      "grad_norm": 1.7233940940997172e-09,
      "kl": 0.045166015625,
      "learning_rate": 1.9098300562505266e-06,
      "loss": 0.0018,
      "num_tokens": 174987689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2461
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8206666666666667,
      "grad_norm": 2.0414827606174413e-09,
      "kl": 0.04742431640625,
      "learning_rate": 1.9029963422876608e-06,
      "loss": 0.0019,
      "num_tokens": 175064297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2462
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.821,
      "grad_norm": 2.7579463157678674e-09,
      "kl": 0.0477294921875,
      "learning_rate": 1.896173590494057e-06,
      "loss": 0.0019,
      "num_tokens": 175141065.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2463
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8213333333333334,
      "grad_norm": 1.9415296037550434e-09,
      "kl": 0.04522705078125,
      "learning_rate": 1.8893618101067357e-06,
      "loss": 0.0018,
      "num_tokens": 175218089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2464
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8216666666666667,
      "grad_norm": 1.0607219547154045e-09,
      "kl": 0.04754638671875,
      "learning_rate": 1.8825610103478531e-06,
      "loss": 0.0019,
      "num_tokens": 175291753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2465
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.822,
      "grad_norm": 1.4194794228927776e-09,
      "kl": 0.04461669921875,
      "learning_rate": 1.8757712004247098e-06,
      "loss": 0.0018,
      "num_tokens": 175365529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2466
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8223333333333334,
      "grad_norm": 1.5218216686818664e-09,
      "kl": 0.04693603515625,
      "learning_rate": 1.8689923895297247e-06,
      "loss": 0.0019,
      "num_tokens": 175441033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2467
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8226666666666667,
      "grad_norm": 1.8422288139419152e-09,
      "kl": 0.0443115234375,
      "learning_rate": 1.8622245868404244e-06,
      "loss": 0.0018,
      "num_tokens": 175516281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2468
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.823,
      "grad_norm": 1.6611644282349403e-09,
      "kl": 0.046142578125,
      "learning_rate": 1.8554678015194316e-06,
      "loss": 0.0018,
      "num_tokens": 175592457.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2469
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8233333333333334,
      "grad_norm": 1.45108991489451e-09,
      "kl": 0.04278564453125,
      "learning_rate": 1.848722042714457e-06,
      "loss": 0.0017,
      "num_tokens": 175670009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2470
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8236666666666667,
      "grad_norm": 1.0495018187839378e-09,
      "kl": 0.04400634765625,
      "learning_rate": 1.8419873195582815e-06,
      "loss": 0.0018,
      "num_tokens": 175746457.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2471
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.824,
      "grad_norm": 1.646014213818603e-09,
      "kl": 0.046630859375,
      "learning_rate": 1.8352636411687374e-06,
      "loss": 0.0019,
      "num_tokens": 175824201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2472
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8243333333333334,
      "grad_norm": 1.6282787340671234e-09,
      "kl": 0.04693603515625,
      "learning_rate": 1.8285510166487154e-06,
      "loss": 0.0019,
      "num_tokens": 175897929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2473
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8246666666666667,
      "grad_norm": 2.00043515086179e-09,
      "kl": 0.0482177734375,
      "learning_rate": 1.8218494550861375e-06,
      "loss": 0.0019,
      "num_tokens": 175972937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2474
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.825,
      "grad_norm": 1.5168132305731774e-09,
      "kl": 0.046875,
      "learning_rate": 1.8151589655539391e-06,
      "loss": 0.0019,
      "num_tokens": 176047913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2475
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8253333333333334,
      "grad_norm": 2.0142749690421624e-09,
      "kl": 0.0452880859375,
      "learning_rate": 1.808479557110081e-06,
      "loss": 0.0018,
      "num_tokens": 176123241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2476
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8256666666666667,
      "grad_norm": 9.166812864336293e-10,
      "kl": 0.04437255859375,
      "learning_rate": 1.8018112387975139e-06,
      "loss": 0.0018,
      "num_tokens": 176197161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2477
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.826,
      "grad_norm": 1.8321856254388535e-09,
      "kl": 0.04608154296875,
      "learning_rate": 1.7951540196441698e-06,
      "loss": 0.0018,
      "num_tokens": 176273545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2478
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8263333333333334,
      "grad_norm": 1.1298240121249137e-09,
      "kl": 0.0419921875,
      "learning_rate": 1.7885079086629598e-06,
      "loss": 0.0017,
      "num_tokens": 176348169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2479
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8266666666666667,
      "grad_norm": 1.8433952142515864e-09,
      "kl": 0.0482177734375,
      "learning_rate": 1.7818729148517588e-06,
      "loss": 0.0019,
      "num_tokens": 176424809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2480
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.827,
      "grad_norm": 1.3199487058912496e-09,
      "kl": 0.0498046875,
      "learning_rate": 1.7752490471933769e-06,
      "loss": 0.002,
      "num_tokens": 176499145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2481
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8273333333333334,
      "grad_norm": 1.266582394521265e-09,
      "kl": 0.047119140625,
      "learning_rate": 1.7686363146555807e-06,
      "loss": 0.0019,
      "num_tokens": 176572665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2482
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8276666666666667,
      "grad_norm": 2.000862364681666e-09,
      "kl": 0.04522705078125,
      "learning_rate": 1.7620347261910498e-06,
      "loss": 0.0018,
      "num_tokens": 176648873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2483
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.828,
      "grad_norm": 1.9733867873128474e-09,
      "kl": 0.04534912109375,
      "learning_rate": 1.7554442907373736e-06,
      "loss": 0.0018,
      "num_tokens": 176724505.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2484
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8283333333333334,
      "grad_norm": 1.661237813976868e-09,
      "kl": 0.04498291015625,
      "learning_rate": 1.7488650172170496e-06,
      "loss": 0.0018,
      "num_tokens": 176800857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2485
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8286666666666667,
      "grad_norm": 1.7345337388618987e-09,
      "kl": 0.0445556640625,
      "learning_rate": 1.742296914537459e-06,
      "loss": 0.0018,
      "num_tokens": 176875801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2486
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.829,
      "grad_norm": 1.6666518165564526e-09,
      "kl": 0.0455322265625,
      "learning_rate": 1.7357399915908646e-06,
      "loss": 0.0018,
      "num_tokens": 176950425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2487
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8293333333333334,
      "grad_norm": 1.1203760141853536e-09,
      "kl": 0.044677734375,
      "learning_rate": 1.7291942572543806e-06,
      "loss": 0.0018,
      "num_tokens": 177024665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2488
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8296666666666667,
      "grad_norm": 1.3555520039787439e-09,
      "kl": 0.04595947265625,
      "learning_rate": 1.7226597203899941e-06,
      "loss": 0.0018,
      "num_tokens": 177099049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2489
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.83,
      "grad_norm": 2.691412426258921e-09,
      "kl": 0.04595947265625,
      "learning_rate": 1.7161363898445138e-06,
      "loss": 0.0018,
      "num_tokens": 177175913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2490
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8303333333333334,
      "grad_norm": 1.6293999482996924e-09,
      "kl": 0.04681396484375,
      "learning_rate": 1.709624274449584e-06,
      "loss": 0.0019,
      "num_tokens": 177250521.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2491
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8306666666666667,
      "grad_norm": 2.158119905004696e-09,
      "kl": 0.04449462890625,
      "learning_rate": 1.7031233830216653e-06,
      "loss": 0.0018,
      "num_tokens": 177329273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2492
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.831,
      "grad_norm": 2.01321581627667e-09,
      "kl": 0.0457763671875,
      "learning_rate": 1.6966337243620267e-06,
      "loss": 0.0018,
      "num_tokens": 177406169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2493
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8313333333333334,
      "grad_norm": 1.530138238337031e-09,
      "kl": 0.0447998046875,
      "learning_rate": 1.6901553072567189e-06,
      "loss": 0.0018,
      "num_tokens": 177484841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2494
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8316666666666667,
      "grad_norm": 1.9579078358589186e-09,
      "kl": 0.04583740234375,
      "learning_rate": 1.6836881404765793e-06,
      "loss": 0.0018,
      "num_tokens": 177565001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2495
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.832,
      "grad_norm": 2.626694195484447e-09,
      "kl": 0.04620361328125,
      "learning_rate": 1.677232232777224e-06,
      "loss": 0.0018,
      "num_tokens": 177640665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2496
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8323333333333334,
      "grad_norm": 1.175638364436793e-09,
      "kl": 0.05035400390625,
      "learning_rate": 1.6707875928990059e-06,
      "loss": 0.002,
      "num_tokens": 177715865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2497
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8326666666666667,
      "grad_norm": 1.830399609659139e-09,
      "kl": 0.0457763671875,
      "learning_rate": 1.6643542295670367e-06,
      "loss": 0.0018,
      "num_tokens": 177790857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2498
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.833,
      "grad_norm": 1.662479043318399e-09,
      "kl": 0.042724609375,
      "learning_rate": 1.6579321514911606e-06,
      "loss": 0.0017,
      "num_tokens": 177866121.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2499
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8333333333333334,
      "grad_norm": 1.0961127561159856e-09,
      "kl": 0.04705810546875,
      "learning_rate": 1.651521367365936e-06,
      "loss": 0.0019,
      "num_tokens": 177939785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2500
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8336666666666667,
      "grad_norm": 1.660401038883208e-09,
      "kl": 0.04241943359375,
      "learning_rate": 1.6451218858706374e-06,
      "loss": 0.0017,
      "num_tokens": 178016681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2501
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.834,
      "grad_norm": 1.316000863837985e-09,
      "kl": 0.04620361328125,
      "learning_rate": 1.638733715669234e-06,
      "loss": 0.0018,
      "num_tokens": 178091945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2502
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8343333333333334,
      "grad_norm": 2.3309589813891307e-09,
      "kl": 0.044921875,
      "learning_rate": 1.6323568654103838e-06,
      "loss": 0.0018,
      "num_tokens": 178169945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2503
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8346666666666667,
      "grad_norm": 1.5030010569461183e-09,
      "kl": 0.04351806640625,
      "learning_rate": 1.6259913437274167e-06,
      "loss": 0.0017,
      "num_tokens": 178245561.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2504
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.835,
      "grad_norm": 1.5137225917172259e-09,
      "kl": 0.04864501953125,
      "learning_rate": 1.6196371592383264e-06,
      "loss": 0.0019,
      "num_tokens": 178321113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2505
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8353333333333334,
      "grad_norm": 1.3590236713767467e-09,
      "kl": 0.04608154296875,
      "learning_rate": 1.6132943205457607e-06,
      "loss": 0.0018,
      "num_tokens": 178397753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2506
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8356666666666667,
      "grad_norm": 1.163644514079465e-09,
      "kl": 0.04473876953125,
      "learning_rate": 1.6069628362369993e-06,
      "loss": 0.0018,
      "num_tokens": 178471545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2507
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.836,
      "grad_norm": 1.188214304725932e-09,
      "kl": 0.04632568359375,
      "learning_rate": 1.6006427148839554e-06,
      "loss": 0.0019,
      "num_tokens": 178545225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2508
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8363333333333334,
      "grad_norm": 2.5142559145763244e-09,
      "kl": 0.04656982421875,
      "learning_rate": 1.5943339650431578e-06,
      "loss": 0.0019,
      "num_tokens": 178622825.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2509
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8366666666666667,
      "grad_norm": 1.368064106443967e-09,
      "kl": 0.045166015625,
      "learning_rate": 1.5880365952557387e-06,
      "loss": 0.0018,
      "num_tokens": 178699401.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2510
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.837,
      "grad_norm": 2.8552809006043844e-09,
      "kl": 0.04779052734375,
      "learning_rate": 1.5817506140474248e-06,
      "loss": 0.0019,
      "num_tokens": 178779993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2511
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8373333333333334,
      "grad_norm": 1.5192601621194513e-09,
      "kl": 0.0435791015625,
      "learning_rate": 1.5754760299285255e-06,
      "loss": 0.0017,
      "num_tokens": 178854265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2512
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8376666666666667,
      "grad_norm": 1.8722465799925203e-09,
      "kl": 0.04888916015625,
      "learning_rate": 1.5692128513939142e-06,
      "loss": 0.002,
      "num_tokens": 178930073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2513
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.838,
      "grad_norm": 3.5012095445097202e-09,
      "kl": 0.04913330078125,
      "learning_rate": 1.5629610869230272e-06,
      "loss": 0.002,
      "num_tokens": 179008233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2514
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8383333333333334,
      "grad_norm": 2.2937911570153346e-09,
      "kl": 0.04669189453125,
      "learning_rate": 1.5567207449798517e-06,
      "loss": 0.0019,
      "num_tokens": 179085193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2515
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8386666666666667,
      "grad_norm": 1.9419670316267457e-09,
      "kl": 0.0465087890625,
      "learning_rate": 1.5504918340128982e-06,
      "loss": 0.0019,
      "num_tokens": 179162873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2516
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.839,
      "grad_norm": 1.2155141337899522e-09,
      "kl": 0.04901123046875,
      "learning_rate": 1.544274362455216e-06,
      "loss": 0.002,
      "num_tokens": 179238377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2517
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8393333333333334,
      "grad_norm": 2.5740238829286e-09,
      "kl": 0.0487060546875,
      "learning_rate": 1.538068338724361e-06,
      "loss": 0.0019,
      "num_tokens": 179314825.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2518
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8396666666666667,
      "grad_norm": 1.0880418788161705e-09,
      "kl": 0.0439453125,
      "learning_rate": 1.5318737712223853e-06,
      "loss": 0.0018,
      "num_tokens": 179389625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2519
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.84,
      "grad_norm": 1.2253025261088624e-09,
      "kl": 0.04290771484375,
      "learning_rate": 1.5256906683358364e-06,
      "loss": 0.0017,
      "num_tokens": 179464521.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2520
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8403333333333334,
      "grad_norm": 2.1616044509897847e-09,
      "kl": 0.0482177734375,
      "learning_rate": 1.5195190384357405e-06,
      "loss": 0.0019,
      "num_tokens": 179541609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2521
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8406666666666667,
      "grad_norm": 2.509201069145206e-09,
      "kl": 0.04925537109375,
      "learning_rate": 1.513358889877592e-06,
      "loss": 0.002,
      "num_tokens": 179617593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2522
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.841,
      "grad_norm": 1.9853581001427756e-09,
      "kl": 0.04693603515625,
      "learning_rate": 1.5072102310013314e-06,
      "loss": 0.0019,
      "num_tokens": 179695113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2523
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8413333333333334,
      "grad_norm": 1.2919806335887074e-09,
      "kl": 0.04595947265625,
      "learning_rate": 1.5010730701313626e-06,
      "loss": 0.0018,
      "num_tokens": 179769481.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2524
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8416666666666667,
      "grad_norm": 1.0720540011277535e-09,
      "kl": 0.0447998046875,
      "learning_rate": 1.494947415576502e-06,
      "loss": 0.0018,
      "num_tokens": 179844121.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2525
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.842,
      "grad_norm": 2.1835697694427836e-09,
      "kl": 0.04864501953125,
      "learning_rate": 1.4888332756300027e-06,
      "loss": 0.0019,
      "num_tokens": 179921577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2526
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8423333333333334,
      "grad_norm": 2.063119675099756e-09,
      "kl": 0.0457763671875,
      "learning_rate": 1.4827306585695234e-06,
      "loss": 0.0018,
      "num_tokens": 179997225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2527
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8426666666666667,
      "grad_norm": 2.2825248358060435e-09,
      "kl": 0.04632568359375,
      "learning_rate": 1.4766395726571258e-06,
      "loss": 0.0019,
      "num_tokens": 180073641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2528
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.843,
      "grad_norm": 1.7258594553481998e-09,
      "kl": 0.04486083984375,
      "learning_rate": 1.4705600261392505e-06,
      "loss": 0.0018,
      "num_tokens": 180154761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2529
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8433333333333334,
      "grad_norm": 1.3981876767488188e-09,
      "kl": 0.04888916015625,
      "learning_rate": 1.4644920272467245e-06,
      "loss": 0.002,
      "num_tokens": 180228681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2530
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8436666666666667,
      "grad_norm": 2.0124735211624056e-09,
      "kl": 0.04620361328125,
      "learning_rate": 1.4584355841947452e-06,
      "loss": 0.0019,
      "num_tokens": 180303625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2531
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.844,
      "grad_norm": 1.558583484495557e-09,
      "kl": 0.04425048828125,
      "learning_rate": 1.4523907051828502e-06,
      "loss": 0.0018,
      "num_tokens": 180378153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2532
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8443333333333334,
      "grad_norm": 1.2809792115930918e-09,
      "kl": 0.0445556640625,
      "learning_rate": 1.446357398394934e-06,
      "loss": 0.0018,
      "num_tokens": 180452937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2533
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8446666666666667,
      "grad_norm": 1.6429358984382247e-09,
      "kl": 0.0430908203125,
      "learning_rate": 1.4403356719992201e-06,
      "loss": 0.0017,
      "num_tokens": 180527961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2534
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.845,
      "grad_norm": 1.6691761306475428e-09,
      "kl": 0.04339599609375,
      "learning_rate": 1.4343255341482486e-06,
      "loss": 0.0017,
      "num_tokens": 180603529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2535
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8453333333333334,
      "grad_norm": 1.4524653701997181e-09,
      "kl": 0.04449462890625,
      "learning_rate": 1.4283269929788779e-06,
      "loss": 0.0018,
      "num_tokens": 180677929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2536
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8456666666666667,
      "grad_norm": 1.9534622808237145e-09,
      "kl": 0.0458984375,
      "learning_rate": 1.4223400566122635e-06,
      "loss": 0.0018,
      "num_tokens": 180752777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2537
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.846,
      "grad_norm": 1.9489254654558863e-09,
      "kl": 0.048095703125,
      "learning_rate": 1.416364733153849e-06,
      "loss": 0.0019,
      "num_tokens": 180828233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2538
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8463333333333334,
      "grad_norm": 1.6075731856801667e-09,
      "kl": 0.0465087890625,
      "learning_rate": 1.4104010306933558e-06,
      "loss": 0.0019,
      "num_tokens": 180905609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2539
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8466666666666667,
      "grad_norm": 1.5459985513999186e-09,
      "kl": 0.0455322265625,
      "learning_rate": 1.4044489573047759e-06,
      "loss": 0.0018,
      "num_tokens": 180983625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2540
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.847,
      "grad_norm": 1.1015142131753919e-09,
      "kl": 0.04754638671875,
      "learning_rate": 1.3985085210463479e-06,
      "loss": 0.0019,
      "num_tokens": 181058265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2541
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8473333333333334,
      "grad_norm": 1.4479096810404712e-09,
      "kl": 0.04193115234375,
      "learning_rate": 1.3925797299605649e-06,
      "loss": 0.0017,
      "num_tokens": 181133993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2542
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8476666666666667,
      "grad_norm": 1.9511812165973197e-09,
      "kl": 0.04815673828125,
      "learning_rate": 1.3866625920741495e-06,
      "loss": 0.0019,
      "num_tokens": 181210969.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2543
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.848,
      "grad_norm": 1.7412599140342877e-09,
      "kl": 0.04486083984375,
      "learning_rate": 1.3807571153980504e-06,
      "loss": 0.0018,
      "num_tokens": 181286105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2544
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8483333333333334,
      "grad_norm": 1.3836660706090242e-09,
      "kl": 0.04278564453125,
      "learning_rate": 1.3748633079274254e-06,
      "loss": 0.0017,
      "num_tokens": 181362201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2545
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8486666666666667,
      "grad_norm": 1.5559509236595659e-09,
      "kl": 0.0435791015625,
      "learning_rate": 1.368981177641636e-06,
      "loss": 0.0017,
      "num_tokens": 181439593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2546
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.849,
      "grad_norm": 1.6364407606772602e-09,
      "kl": 0.0433349609375,
      "learning_rate": 1.363110732504237e-06,
      "loss": 0.0017,
      "num_tokens": 181515721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2547
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8493333333333334,
      "grad_norm": 2.5346389431746275e-09,
      "kl": 0.04779052734375,
      "learning_rate": 1.3572519804629537e-06,
      "loss": 0.0019,
      "num_tokens": 181594393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2548
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8496666666666667,
      "grad_norm": 8.960080455366892e-10,
      "kl": 0.04595947265625,
      "learning_rate": 1.3514049294496911e-06,
      "loss": 0.0018,
      "num_tokens": 181671049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2549
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.85,
      "grad_norm": 1.3987283553618113e-09,
      "kl": 0.04547119140625,
      "learning_rate": 1.3455695873805086e-06,
      "loss": 0.0018,
      "num_tokens": 181745385.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2550
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8503333333333334,
      "grad_norm": 1.5169235867418251e-09,
      "kl": 0.043701171875,
      "learning_rate": 1.339745962155613e-06,
      "loss": 0.0017,
      "num_tokens": 181821433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2551
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8506666666666667,
      "grad_norm": 1.0315400755800397e-09,
      "kl": 0.045654296875,
      "learning_rate": 1.3339340616593487e-06,
      "loss": 0.0018,
      "num_tokens": 181895337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2552
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.851,
      "grad_norm": 1.4944254722593087e-09,
      "kl": 0.04644775390625,
      "learning_rate": 1.3281338937601895e-06,
      "loss": 0.0019,
      "num_tokens": 181969641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2553
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8513333333333334,
      "grad_norm": 2.0457020522002267e-09,
      "kl": 0.047607421875,
      "learning_rate": 1.322345466310717e-06,
      "loss": 0.0019,
      "num_tokens": 182045561.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2554
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8516666666666667,
      "grad_norm": 1.4869813158568945e-09,
      "kl": 0.0462646484375,
      "learning_rate": 1.316568787147624e-06,
      "loss": 0.0018,
      "num_tokens": 182122073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2555
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.852,
      "grad_norm": 1.9542290008445207e-09,
      "kl": 0.0469970703125,
      "learning_rate": 1.3108038640916988e-06,
      "loss": 0.0019,
      "num_tokens": 182196057.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2556
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8523333333333334,
      "grad_norm": 8.708965770765076e-10,
      "kl": 0.040771484375,
      "learning_rate": 1.30505070494781e-06,
      "loss": 0.0016,
      "num_tokens": 182270121.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2557
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8526666666666667,
      "grad_norm": 1.6140492276051077e-09,
      "kl": 0.04754638671875,
      "learning_rate": 1.2993093175049022e-06,
      "loss": 0.0019,
      "num_tokens": 182346313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2558
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.853,
      "grad_norm": 1.5871274294809723e-09,
      "kl": 0.047119140625,
      "learning_rate": 1.2935797095359825e-06,
      "loss": 0.0019,
      "num_tokens": 182421465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2559
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8533333333333334,
      "grad_norm": 1.9031267672886543e-09,
      "kl": 0.0439453125,
      "learning_rate": 1.2878618887981064e-06,
      "loss": 0.0018,
      "num_tokens": 182497305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2560
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8536666666666667,
      "grad_norm": 2.1138784056518034e-09,
      "kl": 0.04833984375,
      "learning_rate": 1.282155863032377e-06,
      "loss": 0.0019,
      "num_tokens": 182573881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2561
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.854,
      "grad_norm": 1.5848523604589104e-09,
      "kl": 0.0474853515625,
      "learning_rate": 1.2764616399639252e-06,
      "loss": 0.0019,
      "num_tokens": 182650345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2562
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8543333333333333,
      "grad_norm": 1.354097167727275e-09,
      "kl": 0.04937744140625,
      "learning_rate": 1.2707792273019049e-06,
      "loss": 0.002,
      "num_tokens": 182725337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2563
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8546666666666667,
      "grad_norm": 1.7183370282225496e-09,
      "kl": 0.0455322265625,
      "learning_rate": 1.2651086327394745e-06,
      "loss": 0.0018,
      "num_tokens": 182801113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2564
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.855,
      "grad_norm": 2.0987287463469784e-09,
      "kl": 0.04620361328125,
      "learning_rate": 1.2594498639538032e-06,
      "loss": 0.0018,
      "num_tokens": 182876425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2565
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8553333333333333,
      "grad_norm": 2.755427441769598e-09,
      "kl": 0.0439453125,
      "learning_rate": 1.2538029286060428e-06,
      "loss": 0.0018,
      "num_tokens": 182952105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2566
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8556666666666667,
      "grad_norm": 1.520349401928911e-09,
      "kl": 0.04425048828125,
      "learning_rate": 1.2481678343413216e-06,
      "loss": 0.0018,
      "num_tokens": 183027625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2567
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.856,
      "grad_norm": 1.4083348931492878e-09,
      "kl": 0.04437255859375,
      "learning_rate": 1.2425445887887422e-06,
      "loss": 0.0018,
      "num_tokens": 183103353.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2568
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8563333333333333,
      "grad_norm": 2.595440973252039e-09,
      "kl": 0.0457763671875,
      "learning_rate": 1.2369331995613664e-06,
      "loss": 0.0018,
      "num_tokens": 183182729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2569
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8566666666666667,
      "grad_norm": 1.3948296961885376e-09,
      "kl": 0.044921875,
      "learning_rate": 1.2313336742561965e-06,
      "loss": 0.0018,
      "num_tokens": 183257769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2570
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.857,
      "grad_norm": 1.1847309799861705e-09,
      "kl": 0.04510498046875,
      "learning_rate": 1.2257460204541793e-06,
      "loss": 0.0018,
      "num_tokens": 183333721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2571
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8573333333333333,
      "grad_norm": 2.0406178968812583e-09,
      "kl": 0.04443359375,
      "learning_rate": 1.2201702457201948e-06,
      "loss": 0.0018,
      "num_tokens": 183407577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2572
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8576666666666667,
      "grad_norm": 1.4084077237797032e-09,
      "kl": 0.04486083984375,
      "learning_rate": 1.2146063576030265e-06,
      "loss": 0.0018,
      "num_tokens": 183484905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2573
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.858,
      "grad_norm": 1.9261217065746905e-09,
      "kl": 0.04296875,
      "learning_rate": 1.2090543636353746e-06,
      "loss": 0.0017,
      "num_tokens": 183561609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2574
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8583333333333333,
      "grad_norm": 1.1394422072541488e-09,
      "kl": 0.046142578125,
      "learning_rate": 1.2035142713338366e-06,
      "loss": 0.0018,
      "num_tokens": 183637225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2575
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8586666666666667,
      "grad_norm": 2.962248224491759e-09,
      "kl": 0.0479736328125,
      "learning_rate": 1.1979860881988903e-06,
      "loss": 0.0019,
      "num_tokens": 183715209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2576
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.859,
      "grad_norm": 1.272532967888651e-09,
      "kl": 0.045166015625,
      "learning_rate": 1.1924698217148955e-06,
      "loss": 0.0018,
      "num_tokens": 183790585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2577
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8593333333333333,
      "grad_norm": 2.0194408367757433e-09,
      "kl": 0.04486083984375,
      "learning_rate": 1.1869654793500784e-06,
      "loss": 0.0018,
      "num_tokens": 183865241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2578
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8596666666666667,
      "grad_norm": 2.5757318500296833e-09,
      "kl": 0.0457763671875,
      "learning_rate": 1.18147306855652e-06,
      "loss": 0.0018,
      "num_tokens": 183943097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2579
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.86,
      "grad_norm": 2.58563370714171e-09,
      "kl": 0.044189453125,
      "learning_rate": 1.1759925967701491e-06,
      "loss": 0.0018,
      "num_tokens": 184021593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2580
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8603333333333333,
      "grad_norm": 1.808248439871818e-09,
      "kl": 0.04376220703125,
      "learning_rate": 1.1705240714107301e-06,
      "loss": 0.0018,
      "num_tokens": 184096265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2581
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8606666666666667,
      "grad_norm": 1.5734378244758318e-09,
      "kl": 0.04351806640625,
      "learning_rate": 1.1650674998818556e-06,
      "loss": 0.0017,
      "num_tokens": 184170233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2582
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.861,
      "grad_norm": 1.3491952000066476e-09,
      "kl": 0.04541015625,
      "learning_rate": 1.159622889570927e-06,
      "loss": 0.0018,
      "num_tokens": 184246425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2583
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8613333333333333,
      "grad_norm": 1.0357107393943465e-09,
      "kl": 0.04217529296875,
      "learning_rate": 1.1541902478491607e-06,
      "loss": 0.0017,
      "num_tokens": 184323337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2584
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8616666666666667,
      "grad_norm": 1.6043160133705214e-09,
      "kl": 0.04803466796875,
      "learning_rate": 1.1487695820715672e-06,
      "loss": 0.0019,
      "num_tokens": 184398537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2585
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.862,
      "grad_norm": 1.5244179252249523e-09,
      "kl": 0.04388427734375,
      "learning_rate": 1.1433608995769396e-06,
      "loss": 0.0018,
      "num_tokens": 184473897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2586
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8623333333333333,
      "grad_norm": 9.53287337956965e-10,
      "kl": 0.04425048828125,
      "learning_rate": 1.1379642076878528e-06,
      "loss": 0.0018,
      "num_tokens": 184548905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2587
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8626666666666667,
      "grad_norm": 1.6500151245324446e-09,
      "kl": 0.04620361328125,
      "learning_rate": 1.1325795137106455e-06,
      "loss": 0.0018,
      "num_tokens": 184624041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2588
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.863,
      "grad_norm": 2.1974935204838175e-09,
      "kl": 0.04559326171875,
      "learning_rate": 1.1272068249354085e-06,
      "loss": 0.0018,
      "num_tokens": 184700393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2589
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8633333333333333,
      "grad_norm": 2.0920136734048356e-09,
      "kl": 0.04754638671875,
      "learning_rate": 1.1218461486359878e-06,
      "loss": 0.0019,
      "num_tokens": 184778697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2590
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8636666666666667,
      "grad_norm": 1.40344147414595e-09,
      "kl": 0.04840087890625,
      "learning_rate": 1.1164974920699611e-06,
      "loss": 0.0019,
      "num_tokens": 184853849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2591
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.864,
      "grad_norm": 1.3430054845997574e-09,
      "kl": 0.049560546875,
      "learning_rate": 1.1111608624786307e-06,
      "loss": 0.002,
      "num_tokens": 184928217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2592
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8643333333333333,
      "grad_norm": 1.3960522737832548e-09,
      "kl": 0.04669189453125,
      "learning_rate": 1.1058362670870248e-06,
      "loss": 0.0019,
      "num_tokens": 185004889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2593
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8646666666666667,
      "grad_norm": 1.172054120424093e-09,
      "kl": 0.0474853515625,
      "learning_rate": 1.1005237131038725e-06,
      "loss": 0.0019,
      "num_tokens": 185079417.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2594
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.865,
      "grad_norm": 1.5231490513301083e-09,
      "kl": 0.04437255859375,
      "learning_rate": 1.0952232077215985e-06,
      "loss": 0.0018,
      "num_tokens": 185154041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2595
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8653333333333333,
      "grad_norm": 1.0784830806187529e-09,
      "kl": 0.0435791015625,
      "learning_rate": 1.0899347581163222e-06,
      "loss": 0.0017,
      "num_tokens": 185228393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2596
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8656666666666667,
      "grad_norm": 1.383213543704187e-09,
      "kl": 0.04345703125,
      "learning_rate": 1.0846583714478355e-06,
      "loss": 0.0017,
      "num_tokens": 185306633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2597
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.866,
      "grad_norm": 1.6527903490271e-09,
      "kl": 0.04644775390625,
      "learning_rate": 1.0793940548596048e-06,
      "loss": 0.0019,
      "num_tokens": 185382905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2598
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8663333333333333,
      "grad_norm": 1.1343456174373046e-09,
      "kl": 0.04541015625,
      "learning_rate": 1.0741418154787443e-06,
      "loss": 0.0018,
      "num_tokens": 185461705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2599
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8666666666666667,
      "grad_norm": 1.1936367450005037e-09,
      "kl": 0.04541015625,
      "learning_rate": 1.0689016604160341e-06,
      "loss": 0.0018,
      "num_tokens": 185536153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2600
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.867,
      "grad_norm": 1.4188243913082488e-09,
      "kl": 0.04827880859375,
      "learning_rate": 1.0636735967658785e-06,
      "loss": 0.0019,
      "num_tokens": 185611689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2601
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8673333333333333,
      "grad_norm": 1.69647040859644e-09,
      "kl": 0.04498291015625,
      "learning_rate": 1.058457631606319e-06,
      "loss": 0.0018,
      "num_tokens": 185688777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2602
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8676666666666667,
      "grad_norm": 1.7936276908159243e-09,
      "kl": 0.045654296875,
      "learning_rate": 1.0532537719990166e-06,
      "loss": 0.0018,
      "num_tokens": 185765897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2603
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.868,
      "grad_norm": 1.2953073058596942e-09,
      "kl": 0.04376220703125,
      "learning_rate": 1.0480620249892448e-06,
      "loss": 0.0018,
      "num_tokens": 185840137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2604
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8683333333333333,
      "grad_norm": 1.4768137823750749e-09,
      "kl": 0.04364013671875,
      "learning_rate": 1.042882397605871e-06,
      "loss": 0.0017,
      "num_tokens": 185918281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2605
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8686666666666667,
      "grad_norm": 1.893451395673651e-09,
      "kl": 0.0491943359375,
      "learning_rate": 1.0377148968613659e-06,
      "loss": 0.002,
      "num_tokens": 185996169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2606
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.869,
      "grad_norm": 1.5184105084387056e-09,
      "kl": 0.04473876953125,
      "learning_rate": 1.0325595297517753e-06,
      "loss": 0.0018,
      "num_tokens": 186071513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2607
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8693333333333333,
      "grad_norm": 1.0488743207304196e-09,
      "kl": 0.04449462890625,
      "learning_rate": 1.0274163032567165e-06,
      "loss": 0.0018,
      "num_tokens": 186150121.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2608
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8696666666666667,
      "grad_norm": 2.3841635332644273e-09,
      "kl": 0.049560546875,
      "learning_rate": 1.0222852243393732e-06,
      "loss": 0.002,
      "num_tokens": 186225529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2609
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.87,
      "grad_norm": 3.7211060899267068e-09,
      "kl": 0.04638671875,
      "learning_rate": 1.017166299946486e-06,
      "loss": 0.0019,
      "num_tokens": 186304393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2610
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8703333333333333,
      "grad_norm": 1.3185385006053707e-09,
      "kl": 0.04705810546875,
      "learning_rate": 1.012059537008332e-06,
      "loss": 0.0019,
      "num_tokens": 186384697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2611
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8706666666666667,
      "grad_norm": 1.0482662515798324e-09,
      "kl": 0.0438232421875,
      "learning_rate": 1.0069649424387274e-06,
      "loss": 0.0018,
      "num_tokens": 186460153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2612
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.871,
      "grad_norm": 1.4424126337786447e-09,
      "kl": 0.0447998046875,
      "learning_rate": 1.0018825231350203e-06,
      "loss": 0.0018,
      "num_tokens": 186535689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2613
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8713333333333333,
      "grad_norm": 1.5386187879329327e-09,
      "kl": 0.04437255859375,
      "learning_rate": 9.968122859780648e-07,
      "loss": 0.0018,
      "num_tokens": 186610761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2614
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8716666666666667,
      "grad_norm": 1.954133299619798e-09,
      "kl": 0.0450439453125,
      "learning_rate": 9.917542378322299e-07,
      "loss": 0.0018,
      "num_tokens": 186688297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2615
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.872,
      "grad_norm": 2.2020136825062764e-09,
      "kl": 0.04656982421875,
      "learning_rate": 9.867083855453775e-07,
      "loss": 0.0019,
      "num_tokens": 186765817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2616
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8723333333333333,
      "grad_norm": 1.8144351576765416e-09,
      "kl": 0.0462646484375,
      "learning_rate": 9.816747359488632e-07,
      "loss": 0.0019,
      "num_tokens": 186841897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2617
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8726666666666667,
      "grad_norm": 2.402335663731492e-09,
      "kl": 0.04559326171875,
      "learning_rate": 9.766532958575158e-07,
      "loss": 0.0018,
      "num_tokens": 186916025.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2618
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.873,
      "grad_norm": 1.4105491219496002e-09,
      "kl": 0.04754638671875,
      "learning_rate": 9.716440720696375e-07,
      "loss": 0.0019,
      "num_tokens": 186991257.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2619
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8733333333333333,
      "grad_norm": 1.0827277963088022e-09,
      "kl": 0.04827880859375,
      "learning_rate": 9.666470713669918e-07,
      "loss": 0.0019,
      "num_tokens": 187065145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2620
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8736666666666667,
      "grad_norm": 1.6385162115994945e-09,
      "kl": 0.0472412109375,
      "learning_rate": 9.616623005147952e-07,
      "loss": 0.0019,
      "num_tokens": 187141993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2621
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.874,
      "grad_norm": 2.8021986953064015e-09,
      "kl": 0.0467529296875,
      "learning_rate": 9.566897662617014e-07,
      "loss": 0.0019,
      "num_tokens": 187218041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2622
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8743333333333333,
      "grad_norm": 1.4414750504343488e-09,
      "kl": 0.04541015625,
      "learning_rate": 9.517294753398066e-07,
      "loss": 0.0018,
      "num_tokens": 187293609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2623
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8746666666666667,
      "grad_norm": 1.9151751295964914e-09,
      "kl": 0.0458984375,
      "learning_rate": 9.467814344646187e-07,
      "loss": 0.0018,
      "num_tokens": 187370585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2624
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.875,
      "grad_norm": 1.572804664284888e-09,
      "kl": 0.047119140625,
      "learning_rate": 9.418456503350714e-07,
      "loss": 0.0019,
      "num_tokens": 187444041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2625
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8753333333333333,
      "grad_norm": 1.2705704266480211e-09,
      "kl": 0.0433349609375,
      "learning_rate": 9.369221296335007e-07,
      "loss": 0.0017,
      "num_tokens": 187519433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2626
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8756666666666667,
      "grad_norm": 2.4518764796255255e-09,
      "kl": 0.04833984375,
      "learning_rate": 9.320108790256399e-07,
      "loss": 0.0019,
      "num_tokens": 187598649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2627
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.876,
      "grad_norm": 8.608486146144401e-10,
      "kl": 0.0474853515625,
      "learning_rate": 9.271119051606103e-07,
      "loss": 0.0019,
      "num_tokens": 187671929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2628
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8763333333333333,
      "grad_norm": 1.4270256087911548e-09,
      "kl": 0.0474853515625,
      "learning_rate": 9.222252146709143e-07,
      "loss": 0.0019,
      "num_tokens": 187747209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2629
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8766666666666667,
      "grad_norm": 1.2907819257890196e-09,
      "kl": 0.0416259765625,
      "learning_rate": 9.173508141724197e-07,
      "loss": 0.0017,
      "num_tokens": 187823049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2630
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.877,
      "grad_norm": 1.7641250682487453e-09,
      "kl": 0.04388427734375,
      "learning_rate": 9.124887102643576e-07,
      "loss": 0.0018,
      "num_tokens": 187899369.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2631
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8773333333333333,
      "grad_norm": 2.342011917733089e-09,
      "kl": 0.0478515625,
      "learning_rate": 9.076389095293148e-07,
      "loss": 0.0019,
      "num_tokens": 187981561.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2632
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8776666666666667,
      "grad_norm": 9.404841350146853e-10,
      "kl": 0.042236328125,
      "learning_rate": 9.028014185332168e-07,
      "loss": 0.0017,
      "num_tokens": 188055641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2633
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.878,
      "grad_norm": 3.865779696354821e-09,
      "kl": 0.0428466796875,
      "learning_rate": 8.979762438253259e-07,
      "loss": 0.0017,
      "num_tokens": 188131641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2634
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8783333333333333,
      "grad_norm": 2.162332757293939e-09,
      "kl": 0.046630859375,
      "learning_rate": 8.931633919382299e-07,
      "loss": 0.0019,
      "num_tokens": 188210169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2635
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8786666666666667,
      "grad_norm": 2.2158062051857996e-09,
      "kl": 0.03955078125,
      "learning_rate": 8.883628693878299e-07,
      "loss": 0.0016,
      "num_tokens": 188286105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2636
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.879,
      "grad_norm": 1.310844544022416e-09,
      "kl": 0.0462646484375,
      "learning_rate": 8.835746826733404e-07,
      "loss": 0.0019,
      "num_tokens": 188360393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2637
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8793333333333333,
      "grad_norm": 1.2211397448780303e-09,
      "kl": 0.0443115234375,
      "learning_rate": 8.787988382772705e-07,
      "loss": 0.0018,
      "num_tokens": 188436729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2638
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8796666666666667,
      "grad_norm": 2.4066997283966884e-09,
      "kl": 0.05029296875,
      "learning_rate": 8.740353426654236e-07,
      "loss": 0.002,
      "num_tokens": 188515049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2639
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.88,
      "grad_norm": 1.4026209083084495e-09,
      "kl": 0.04486083984375,
      "learning_rate": 8.692842022868764e-07,
      "loss": 0.0018,
      "num_tokens": 188590969.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2640
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8803333333333333,
      "grad_norm": 1.4049891250422775e-09,
      "kl": 0.04443359375,
      "learning_rate": 8.645454235739903e-07,
      "loss": 0.0018,
      "num_tokens": 188665465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2641
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8806666666666667,
      "grad_norm": 1.828218576527263e-09,
      "kl": 0.046142578125,
      "learning_rate": 8.598190129423844e-07,
      "loss": 0.0018,
      "num_tokens": 188742265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2642
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.881,
      "grad_norm": 1.9109716031806556e-09,
      "kl": 0.047119140625,
      "learning_rate": 8.551049767909314e-07,
      "loss": 0.0019,
      "num_tokens": 188817577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2643
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8813333333333333,
      "grad_norm": 2.4950443933136057e-09,
      "kl": 0.04620361328125,
      "learning_rate": 8.504033215017527e-07,
      "loss": 0.0018,
      "num_tokens": 188892169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2644
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8816666666666667,
      "grad_norm": 1.2899302737068297e-09,
      "kl": 0.04339599609375,
      "learning_rate": 8.457140534402098e-07,
      "loss": 0.0017,
      "num_tokens": 188969529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2645
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.882,
      "grad_norm": 1.6007293268671674e-09,
      "kl": 0.04376220703125,
      "learning_rate": 8.41037178954891e-07,
      "loss": 0.0017,
      "num_tokens": 189046265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2646
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8823333333333333,
      "grad_norm": 1.557202033986016e-09,
      "kl": 0.04937744140625,
      "learning_rate": 8.363727043776037e-07,
      "loss": 0.002,
      "num_tokens": 189123001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2647
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8826666666666667,
      "grad_norm": 1.2985638120355247e-09,
      "kl": 0.04388427734375,
      "learning_rate": 8.317206360233765e-07,
      "loss": 0.0018,
      "num_tokens": 189197801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2648
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.883,
      "grad_norm": 1.1025262924846402e-09,
      "kl": 0.04510498046875,
      "learning_rate": 8.270809801904301e-07,
      "loss": 0.0018,
      "num_tokens": 189271417.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2649
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8833333333333333,
      "grad_norm": 9.770765307948182e-10,
      "kl": 0.04339599609375,
      "learning_rate": 8.224537431601886e-07,
      "loss": 0.0017,
      "num_tokens": 189346841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2650
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8836666666666667,
      "grad_norm": 1.3961618527957853e-09,
      "kl": 0.04205322265625,
      "learning_rate": 8.178389311972612e-07,
      "loss": 0.0017,
      "num_tokens": 189422569.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2651
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.884,
      "grad_norm": 2.638132601262555e-09,
      "kl": 0.0469970703125,
      "learning_rate": 8.13236550549431e-07,
      "loss": 0.0019,
      "num_tokens": 189500713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2652
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8843333333333333,
      "grad_norm": 1.4742652654220478e-09,
      "kl": 0.0465087890625,
      "learning_rate": 8.086466074476562e-07,
      "loss": 0.0019,
      "num_tokens": 189575305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2653
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8846666666666667,
      "grad_norm": 9.411876833453903e-10,
      "kl": 0.04901123046875,
      "learning_rate": 8.040691081060548e-07,
      "loss": 0.002,
      "num_tokens": 189649113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2654
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.885,
      "grad_norm": 1.8089580944291583e-09,
      "kl": 0.04638671875,
      "learning_rate": 7.99504058721896e-07,
      "loss": 0.0019,
      "num_tokens": 189726409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2655
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8853333333333333,
      "grad_norm": 1.2890151168676311e-09,
      "kl": 0.0472412109375,
      "learning_rate": 7.949514654755963e-07,
      "loss": 0.0019,
      "num_tokens": 189801529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2656
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8856666666666667,
      "grad_norm": 1.8693517844781127e-09,
      "kl": 0.0450439453125,
      "learning_rate": 7.904113345307073e-07,
      "loss": 0.0018,
      "num_tokens": 189879705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2657
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.886,
      "grad_norm": 1.2553108552637582e-09,
      "kl": 0.041748046875,
      "learning_rate": 7.85883672033908e-07,
      "loss": 0.0017,
      "num_tokens": 189953529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2658
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8863333333333333,
      "grad_norm": 1.6984924577911897e-09,
      "kl": 0.04644775390625,
      "learning_rate": 7.81368484114996e-07,
      "loss": 0.0019,
      "num_tokens": 190029129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2659
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8866666666666667,
      "grad_norm": 1.9010564233923333e-09,
      "kl": 0.04718017578125,
      "learning_rate": 7.768657768868803e-07,
      "loss": 0.0019,
      "num_tokens": 190105545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2660
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.887,
      "grad_norm": 1.2692297213234838e-09,
      "kl": 0.04833984375,
      "learning_rate": 7.723755564455771e-07,
      "loss": 0.0019,
      "num_tokens": 190179385.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2661
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8873333333333333,
      "grad_norm": 2.097444662396697e-09,
      "kl": 0.04620361328125,
      "learning_rate": 7.678978288701911e-07,
      "loss": 0.0018,
      "num_tokens": 190256297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2662
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8876666666666667,
      "grad_norm": 1.4038258333570752e-09,
      "kl": 0.04620361328125,
      "learning_rate": 7.634326002229175e-07,
      "loss": 0.0018,
      "num_tokens": 190330505.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2663
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.888,
      "grad_norm": 1.939425287034169e-09,
      "kl": 0.046142578125,
      "learning_rate": 7.589798765490308e-07,
      "loss": 0.0018,
      "num_tokens": 190407161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2664
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8883333333333333,
      "grad_norm": 1.2057269627163691e-09,
      "kl": 0.04296875,
      "learning_rate": 7.545396638768698e-07,
      "loss": 0.0017,
      "num_tokens": 190481785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2665
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8886666666666667,
      "grad_norm": 1.552492467915556e-09,
      "kl": 0.04150390625,
      "learning_rate": 7.501119682178392e-07,
      "loss": 0.0017,
      "num_tokens": 190556217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2666
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.889,
      "grad_norm": 1.681061512215365e-09,
      "kl": 0.047119140625,
      "learning_rate": 7.456967955663996e-07,
      "loss": 0.0019,
      "num_tokens": 190631897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2667
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8893333333333333,
      "grad_norm": 9.356240227020862e-10,
      "kl": 0.04205322265625,
      "learning_rate": 7.412941519000527e-07,
      "loss": 0.0017,
      "num_tokens": 190707849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2668
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8896666666666667,
      "grad_norm": 1.1852688830416014e-09,
      "kl": 0.04541015625,
      "learning_rate": 7.369040431793406e-07,
      "loss": 0.0018,
      "num_tokens": 190781753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2669
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.89,
      "grad_norm": 1.3693829403749191e-09,
      "kl": 0.0457763671875,
      "learning_rate": 7.325264753478356e-07,
      "loss": 0.0018,
      "num_tokens": 190856585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2670
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8903333333333333,
      "grad_norm": 1.2500912527357855e-09,
      "kl": 0.04437255859375,
      "learning_rate": 7.281614543321269e-07,
      "loss": 0.0018,
      "num_tokens": 190934665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2671
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8906666666666667,
      "grad_norm": 1.3919477792612156e-09,
      "kl": 0.0418701171875,
      "learning_rate": 7.238089860418218e-07,
      "loss": 0.0017,
      "num_tokens": 191011209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2672
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.891,
      "grad_norm": 1.2578008634633875e-09,
      "kl": 0.0447998046875,
      "learning_rate": 7.194690763695312e-07,
      "loss": 0.0018,
      "num_tokens": 191084441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2673
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8913333333333333,
      "grad_norm": 1.169407903844899e-09,
      "kl": 0.04522705078125,
      "learning_rate": 7.151417311908648e-07,
      "loss": 0.0018,
      "num_tokens": 191160489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2674
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8916666666666667,
      "grad_norm": 1.7708263744253827e-09,
      "kl": 0.04779052734375,
      "learning_rate": 7.108269563644188e-07,
      "loss": 0.0019,
      "num_tokens": 191236841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2675
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.892,
      "grad_norm": 1.6522823109710316e-09,
      "kl": 0.04437255859375,
      "learning_rate": 7.065247577317747e-07,
      "loss": 0.0018,
      "num_tokens": 191311033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2676
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8923333333333333,
      "grad_norm": 3.0241904536154607e-09,
      "kl": 0.0447998046875,
      "learning_rate": 7.022351411174866e-07,
      "loss": 0.0018,
      "num_tokens": 191390281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2677
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8926666666666667,
      "grad_norm": 2.002348953311639e-09,
      "kl": 0.04443359375,
      "learning_rate": 6.979581123290702e-07,
      "loss": 0.0018,
      "num_tokens": 191466761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2678
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.893,
      "grad_norm": 1.2075672684019878e-09,
      "kl": 0.0467529296875,
      "learning_rate": 6.936936771570046e-07,
      "loss": 0.0019,
      "num_tokens": 191540665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2679
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8933333333333333,
      "grad_norm": 2.1923927118194797e-09,
      "kl": 0.04754638671875,
      "learning_rate": 6.894418413747183e-07,
      "loss": 0.0019,
      "num_tokens": 191615833.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2680
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8936666666666667,
      "grad_norm": 1.2265195525884565e-09,
      "kl": 0.0474853515625,
      "learning_rate": 6.852026107385756e-07,
      "loss": 0.0019,
      "num_tokens": 191690089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2681
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.894,
      "grad_norm": 3.240332668852375e-09,
      "kl": 0.048583984375,
      "learning_rate": 6.809759909878855e-07,
      "loss": 0.0019,
      "num_tokens": 191769321.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2682
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8943333333333333,
      "grad_norm": 2.8002382634895184e-09,
      "kl": 0.04852294921875,
      "learning_rate": 6.767619878448783e-07,
      "loss": 0.0019,
      "num_tokens": 191845657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2683
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8946666666666667,
      "grad_norm": 1.597170173894824e-09,
      "kl": 0.040771484375,
      "learning_rate": 6.725606070147006e-07,
      "loss": 0.0016,
      "num_tokens": 191919689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2684
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.895,
      "grad_norm": 1.2723861964047956e-09,
      "kl": 0.0491943359375,
      "learning_rate": 6.683718541854134e-07,
      "loss": 0.002,
      "num_tokens": 191993817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2685
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8953333333333333,
      "grad_norm": 1.399685811698248e-09,
      "kl": 0.044677734375,
      "learning_rate": 6.641957350279838e-07,
      "loss": 0.0018,
      "num_tokens": 192073257.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2686
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8956666666666667,
      "grad_norm": 2.2074335692678915e-09,
      "kl": 0.0433349609375,
      "learning_rate": 6.60032255196268e-07,
      "loss": 0.0017,
      "num_tokens": 192152633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2687
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.896,
      "grad_norm": 1.1476454231384992e-09,
      "kl": 0.0440673828125,
      "learning_rate": 6.558814203270147e-07,
      "loss": 0.0018,
      "num_tokens": 192228345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2688
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8963333333333333,
      "grad_norm": 1.1839437208394088e-09,
      "kl": 0.04608154296875,
      "learning_rate": 6.517432360398556e-07,
      "loss": 0.0018,
      "num_tokens": 192302889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2689
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8966666666666666,
      "grad_norm": 1.531577975555365e-09,
      "kl": 0.0469970703125,
      "learning_rate": 6.476177079372903e-07,
      "loss": 0.0019,
      "num_tokens": 192376761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2690
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.897,
      "grad_norm": 1.2339079757950344e-09,
      "kl": 0.0438232421875,
      "learning_rate": 6.435048416046863e-07,
      "loss": 0.0018,
      "num_tokens": 192452553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2691
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8973333333333333,
      "grad_norm": 1.0130593031121293e-09,
      "kl": 0.04632568359375,
      "learning_rate": 6.394046426102673e-07,
      "loss": 0.0019,
      "num_tokens": 192526617.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2692
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8976666666666666,
      "grad_norm": 1.8076048435844427e-09,
      "kl": 0.0445556640625,
      "learning_rate": 6.353171165051109e-07,
      "loss": 0.0018,
      "num_tokens": 192601721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2693
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.898,
      "grad_norm": 1.4858033692277672e-09,
      "kl": 0.04437255859375,
      "learning_rate": 6.312422688231323e-07,
      "loss": 0.0018,
      "num_tokens": 192677961.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2694
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8983333333333333,
      "grad_norm": 1.174542352266883e-09,
      "kl": 0.04583740234375,
      "learning_rate": 6.271801050810856e-07,
      "loss": 0.0018,
      "num_tokens": 192752249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2695
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8986666666666666,
      "grad_norm": 1.7336796442890545e-09,
      "kl": 0.04986572265625,
      "learning_rate": 6.231306307785523e-07,
      "loss": 0.002,
      "num_tokens": 192828105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2696
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.899,
      "grad_norm": 1.159254914284702e-09,
      "kl": 0.0467529296875,
      "learning_rate": 6.190938513979317e-07,
      "loss": 0.0019,
      "num_tokens": 192902025.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2697
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8993333333333333,
      "grad_norm": 9.70292957092056e-10,
      "kl": 0.04730224609375,
      "learning_rate": 6.150697724044407e-07,
      "loss": 0.0019,
      "num_tokens": 192976777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2698
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.8996666666666666,
      "grad_norm": 2.9080164942740794e-09,
      "kl": 0.0467529296875,
      "learning_rate": 6.110583992460984e-07,
      "loss": 0.0019,
      "num_tokens": 193054809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2699
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9,
      "grad_norm": 1.814270844668897e-09,
      "kl": 0.04632568359375,
      "learning_rate": 6.070597373537201e-07,
      "loss": 0.0019,
      "num_tokens": 193136345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2700
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9003333333333333,
      "grad_norm": 1.4713595897219989e-09,
      "kl": 0.046142578125,
      "learning_rate": 6.030737921409169e-07,
      "loss": 0.0018,
      "num_tokens": 193214361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2701
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9006666666666666,
      "grad_norm": 1.4805883186141955e-09,
      "kl": 0.04803466796875,
      "learning_rate": 5.991005690040797e-07,
      "loss": 0.0019,
      "num_tokens": 193290985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2702
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.901,
      "grad_norm": 1.622341261331428e-09,
      "kl": 0.04644775390625,
      "learning_rate": 5.951400733223766e-07,
      "loss": 0.0019,
      "num_tokens": 193366473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2703
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9013333333333333,
      "grad_norm": 1.7025219012367643e-09,
      "kl": 0.046142578125,
      "learning_rate": 5.911923104577455e-07,
      "loss": 0.0018,
      "num_tokens": 193442265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2704
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9016666666666666,
      "grad_norm": 1.489463663517654e-09,
      "kl": 0.0452880859375,
      "learning_rate": 5.872572857548853e-07,
      "loss": 0.0018,
      "num_tokens": 193517129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2705
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.902,
      "grad_norm": 1.3886027883103225e-09,
      "kl": 0.04571533203125,
      "learning_rate": 5.833350045412478e-07,
      "loss": 0.0018,
      "num_tokens": 193592905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2706
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9023333333333333,
      "grad_norm": 1.3443077762076427e-09,
      "kl": 0.04425048828125,
      "learning_rate": 5.794254721270331e-07,
      "loss": 0.0018,
      "num_tokens": 193670201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2707
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9026666666666666,
      "grad_norm": 1.6732656371587495e-09,
      "kl": 0.0465087890625,
      "learning_rate": 5.75528693805183e-07,
      "loss": 0.0019,
      "num_tokens": 193746153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2708
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.903,
      "grad_norm": 2.249676001042644e-09,
      "kl": 0.04803466796875,
      "learning_rate": 5.716446748513682e-07,
      "loss": 0.0019,
      "num_tokens": 193824537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2709
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9033333333333333,
      "grad_norm": 1.4147114590912224e-09,
      "kl": 0.0478515625,
      "learning_rate": 5.677734205239904e-07,
      "loss": 0.0019,
      "num_tokens": 193899513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2710
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9036666666666666,
      "grad_norm": 2.226965722940122e-09,
      "kl": 0.047607421875,
      "learning_rate": 5.63914936064165e-07,
      "loss": 0.0019,
      "num_tokens": 193977161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2711
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.904,
      "grad_norm": 1.8142483071414972e-09,
      "kl": 0.0467529296875,
      "learning_rate": 5.600692266957208e-07,
      "loss": 0.0019,
      "num_tokens": 194053017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2712
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9043333333333333,
      "grad_norm": 1.6519959844529808e-09,
      "kl": 0.046142578125,
      "learning_rate": 5.562362976251901e-07,
      "loss": 0.0018,
      "num_tokens": 194129113.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2713
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9046666666666666,
      "grad_norm": 1.6164379834648912e-09,
      "kl": 0.04547119140625,
      "learning_rate": 5.524161540418039e-07,
      "loss": 0.0018,
      "num_tokens": 194205225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2714
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.905,
      "grad_norm": 1.5143343246037944e-09,
      "kl": 0.04541015625,
      "learning_rate": 5.48608801117485e-07,
      "loss": 0.0018,
      "num_tokens": 194280681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2715
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9053333333333333,
      "grad_norm": 1.825068540739494e-09,
      "kl": 0.0447998046875,
      "learning_rate": 5.448142440068316e-07,
      "loss": 0.0018,
      "num_tokens": 194354857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2716
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9056666666666666,
      "grad_norm": 9.96274396314334e-10,
      "kl": 0.04925537109375,
      "learning_rate": 5.410324878471296e-07,
      "loss": 0.002,
      "num_tokens": 194428601.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2717
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.906,
      "grad_norm": 8.817634955526898e-10,
      "kl": 0.04949951171875,
      "learning_rate": 5.37263537758328e-07,
      "loss": 0.002,
      "num_tokens": 194502649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2718
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9063333333333333,
      "grad_norm": 1.4872045817071466e-09,
      "kl": 0.04766845703125,
      "learning_rate": 5.335073988430373e-07,
      "loss": 0.0019,
      "num_tokens": 194579481.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2719
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9066666666666666,
      "grad_norm": 1.5394499008891671e-09,
      "kl": 0.045166015625,
      "learning_rate": 5.297640761865242e-07,
      "loss": 0.0018,
      "num_tokens": 194655337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2720
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.907,
      "grad_norm": 1.8699926052079263e-09,
      "kl": 0.0457763671875,
      "learning_rate": 5.26033574856708e-07,
      "loss": 0.0018,
      "num_tokens": 194730265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2721
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9073333333333333,
      "grad_norm": 1.7702224130999866e-09,
      "kl": 0.0501708984375,
      "learning_rate": 5.223158999041444e-07,
      "loss": 0.002,
      "num_tokens": 194806393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2722
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9076666666666666,
      "grad_norm": 2.0958821345118395e-09,
      "kl": 0.046630859375,
      "learning_rate": 5.18611056362025e-07,
      "loss": 0.0019,
      "num_tokens": 194882057.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2723
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.908,
      "grad_norm": 2.8289557363336826e-09,
      "kl": 0.04193115234375,
      "learning_rate": 5.149190492461753e-07,
      "loss": 0.0017,
      "num_tokens": 194959673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2724
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9083333333333333,
      "grad_norm": 1.5199149716593752e-09,
      "kl": 0.04644775390625,
      "learning_rate": 5.112398835550348e-07,
      "loss": 0.0019,
      "num_tokens": 195032553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2725
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9086666666666666,
      "grad_norm": 1.6076604492099023e-09,
      "kl": 0.0457763671875,
      "learning_rate": 5.075735642696611e-07,
      "loss": 0.0018,
      "num_tokens": 195110185.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2726
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.909,
      "grad_norm": 2.8266640139662513e-09,
      "kl": 0.0478515625,
      "learning_rate": 5.039200963537194e-07,
      "loss": 0.0019,
      "num_tokens": 195187513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2727
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9093333333333333,
      "grad_norm": 1.8253267786150218e-09,
      "kl": 0.04547119140625,
      "learning_rate": 5.002794847534765e-07,
      "loss": 0.0018,
      "num_tokens": 195267353.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2728
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9096666666666666,
      "grad_norm": 1.1923446674444449e-09,
      "kl": 0.04266357421875,
      "learning_rate": 4.966517343977884e-07,
      "loss": 0.0017,
      "num_tokens": 195341513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2729
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.91,
      "grad_norm": 1.3433225642955904e-09,
      "kl": 0.04498291015625,
      "learning_rate": 4.930368501981097e-07,
      "loss": 0.0018,
      "num_tokens": 195417529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2730
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9103333333333333,
      "grad_norm": 1.479961930783702e-09,
      "kl": 0.04742431640625,
      "learning_rate": 4.894348370484648e-07,
      "loss": 0.0019,
      "num_tokens": 195493001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2731
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9106666666666666,
      "grad_norm": 1.5919801032993064e-09,
      "kl": 0.04547119140625,
      "learning_rate": 4.858456998254591e-07,
      "loss": 0.0018,
      "num_tokens": 195569241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2732
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.911,
      "grad_norm": 1.924354453564092e-09,
      "kl": 0.04046630859375,
      "learning_rate": 4.822694433882635e-07,
      "loss": 0.0016,
      "num_tokens": 195646089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2733
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9113333333333333,
      "grad_norm": 1.4513840129737332e-09,
      "kl": 0.04498291015625,
      "learning_rate": 4.787060725786141e-07,
      "loss": 0.0018,
      "num_tokens": 195720681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2734
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9116666666666666,
      "grad_norm": 1.7920511741209566e-09,
      "kl": 0.0455322265625,
      "learning_rate": 4.75155592220794e-07,
      "loss": 0.0018,
      "num_tokens": 195795529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2735
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.912,
      "grad_norm": 1.3332354109607536e-09,
      "kl": 0.0465087890625,
      "learning_rate": 4.7161800712163807e-07,
      "loss": 0.0019,
      "num_tokens": 195872105.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2736
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9123333333333333,
      "grad_norm": 2.0897301666877866e-09,
      "kl": 0.0477294921875,
      "learning_rate": 4.6809332207053083e-07,
      "loss": 0.0019,
      "num_tokens": 195948873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2737
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9126666666666666,
      "grad_norm": 1.4148687776938118e-09,
      "kl": 0.046630859375,
      "learning_rate": 4.6458154183937733e-07,
      "loss": 0.0019,
      "num_tokens": 196023193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2738
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.913,
      "grad_norm": 1.2989350706149594e-09,
      "kl": 0.046875,
      "learning_rate": 4.6108267118262327e-07,
      "loss": 0.0019,
      "num_tokens": 196097017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2739
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9133333333333333,
      "grad_norm": 1.7007215635800321e-09,
      "kl": 0.0482177734375,
      "learning_rate": 4.575967148372318e-07,
      "loss": 0.0019,
      "num_tokens": 196172569.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2740
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9136666666666666,
      "grad_norm": 2.5038797701881776e-09,
      "kl": 0.04400634765625,
      "learning_rate": 4.5412367752268094e-07,
      "loss": 0.0018,
      "num_tokens": 196250921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2741
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.914,
      "grad_norm": 1.3459204861732132e-09,
      "kl": 0.04351806640625,
      "learning_rate": 4.506635639409607e-07,
      "loss": 0.0017,
      "num_tokens": 196329817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2742
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9143333333333333,
      "grad_norm": 1.898086576801461e-09,
      "kl": 0.04833984375,
      "learning_rate": 4.4721637877656377e-07,
      "loss": 0.0019,
      "num_tokens": 196407385.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2743
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9146666666666666,
      "grad_norm": 8.262276973702853e-10,
      "kl": 0.043212890625,
      "learning_rate": 4.4378212669647814e-07,
      "loss": 0.0017,
      "num_tokens": 196481785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2744
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.915,
      "grad_norm": 1.3059461290154672e-09,
      "kl": 0.04620361328125,
      "learning_rate": 4.4036081235018347e-07,
      "loss": 0.0018,
      "num_tokens": 196556665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2745
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9153333333333333,
      "grad_norm": 2.2721404757675145e-09,
      "kl": 0.04449462890625,
      "learning_rate": 4.3695244036964567e-07,
      "loss": 0.0018,
      "num_tokens": 196634153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2746
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9156666666666666,
      "grad_norm": 2.4467967651986555e-09,
      "kl": 0.04669189453125,
      "learning_rate": 4.335570153693036e-07,
      "loss": 0.0019,
      "num_tokens": 196713465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2747
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.916,
      "grad_norm": 1.9677999230083287e-09,
      "kl": 0.04693603515625,
      "learning_rate": 4.301745419460712e-07,
      "loss": 0.0019,
      "num_tokens": 196789513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2748
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9163333333333333,
      "grad_norm": 1.5940316844265112e-09,
      "kl": 0.041748046875,
      "learning_rate": 4.268050246793276e-07,
      "loss": 0.0017,
      "num_tokens": 196863881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2749
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9166666666666666,
      "grad_norm": 2.232148466063677e-09,
      "kl": 0.04779052734375,
      "learning_rate": 4.234484681309103e-07,
      "loss": 0.0019,
      "num_tokens": 196941273.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2750
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.917,
      "grad_norm": 1.4069904130664668e-09,
      "kl": 0.04315185546875,
      "learning_rate": 4.2010487684511105e-07,
      "loss": 0.0017,
      "num_tokens": 197015753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2751
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9173333333333333,
      "grad_norm": 1.3602932114054056e-09,
      "kl": 0.04486083984375,
      "learning_rate": 4.167742553486676e-07,
      "loss": 0.0018,
      "num_tokens": 197091289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2752
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9176666666666666,
      "grad_norm": 1.836065743887616e-09,
      "kl": 0.0482177734375,
      "learning_rate": 4.134566081507585e-07,
      "loss": 0.0019,
      "num_tokens": 197166537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2753
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.918,
      "grad_norm": 8.54634196745252e-10,
      "kl": 0.0474853515625,
      "learning_rate": 4.101519397429976e-07,
      "loss": 0.0019,
      "num_tokens": 197240089.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2754
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9183333333333333,
      "grad_norm": 1.972982222042674e-09,
      "kl": 0.04437255859375,
      "learning_rate": 4.068602545994249e-07,
      "loss": 0.0018,
      "num_tokens": 197317497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2755
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9186666666666666,
      "grad_norm": 2.528717013561277e-09,
      "kl": 0.04681396484375,
      "learning_rate": 4.035815571765089e-07,
      "loss": 0.0019,
      "num_tokens": 197394905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2756
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.919,
      "grad_norm": 1.4672819625971556e-09,
      "kl": 0.04595947265625,
      "learning_rate": 4.003158519131245e-07,
      "loss": 0.0018,
      "num_tokens": 197471625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2757
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9193333333333333,
      "grad_norm": 1.781969127812033e-09,
      "kl": 0.0469970703125,
      "learning_rate": 3.9706314323056936e-07,
      "loss": 0.0019,
      "num_tokens": 197548681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2758
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9196666666666666,
      "grad_norm": 1.5447974011095766e-09,
      "kl": 0.04522705078125,
      "learning_rate": 3.9382343553253764e-07,
      "loss": 0.0018,
      "num_tokens": 197625481.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2759
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.92,
      "grad_norm": 1.4897868494401223e-09,
      "kl": 0.04791259765625,
      "learning_rate": 3.905967332051219e-07,
      "loss": 0.0019,
      "num_tokens": 197700649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2760
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9203333333333333,
      "grad_norm": 1.018335527014358e-09,
      "kl": 0.04583740234375,
      "learning_rate": 3.8738304061681107e-07,
      "loss": 0.0018,
      "num_tokens": 197773625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2761
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9206666666666666,
      "grad_norm": 1.5124627106288813e-09,
      "kl": 0.046142578125,
      "learning_rate": 3.8418236211848147e-07,
      "loss": 0.0018,
      "num_tokens": 197849769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2762
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.921,
      "grad_norm": 1.9083337132741462e-09,
      "kl": 0.0487060546875,
      "learning_rate": 3.809947020433824e-07,
      "loss": 0.0019,
      "num_tokens": 197926201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2763
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9213333333333333,
      "grad_norm": 1.0834209085430757e-09,
      "kl": 0.0426025390625,
      "learning_rate": 3.7782006470714614e-07,
      "loss": 0.0017,
      "num_tokens": 198000489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2764
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9216666666666666,
      "grad_norm": 1.6291868965012668e-09,
      "kl": 0.0426025390625,
      "learning_rate": 3.746584544077736e-07,
      "loss": 0.0017,
      "num_tokens": 198075593.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2765
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.922,
      "grad_norm": 1.491265666508923e-09,
      "kl": 0.04559326171875,
      "learning_rate": 3.715098754256241e-07,
      "loss": 0.0018,
      "num_tokens": 198150329.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2766
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9223333333333333,
      "grad_norm": 1.621741851920433e-09,
      "kl": 0.0455322265625,
      "learning_rate": 3.68374332023419e-07,
      "loss": 0.0018,
      "num_tokens": 198224953.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2767
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9226666666666666,
      "grad_norm": 1.5805363684506801e-09,
      "kl": 0.0438232421875,
      "learning_rate": 3.65251828446227e-07,
      "loss": 0.0018,
      "num_tokens": 198298169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2768
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.923,
      "grad_norm": 1.1675860278614891e-09,
      "kl": 0.04052734375,
      "learning_rate": 3.6214236892146983e-07,
      "loss": 0.0016,
      "num_tokens": 198372745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2769
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9233333333333333,
      "grad_norm": 1.20636134415264e-09,
      "kl": 0.04632568359375,
      "learning_rate": 3.590459576589e-07,
      "loss": 0.0019,
      "num_tokens": 198447657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2770
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9236666666666666,
      "grad_norm": 1.1482861328460103e-09,
      "kl": 0.04736328125,
      "learning_rate": 3.55962598850611e-07,
      "loss": 0.0019,
      "num_tokens": 198522361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2771
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.924,
      "grad_norm": 1.812141880996876e-09,
      "kl": 0.0443115234375,
      "learning_rate": 3.5289229667102463e-07,
      "loss": 0.0018,
      "num_tokens": 198598393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2772
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9243333333333333,
      "grad_norm": 1.741155108980763e-09,
      "kl": 0.04730224609375,
      "learning_rate": 3.498350552768859e-07,
      "loss": 0.0019,
      "num_tokens": 198674265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2773
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9246666666666666,
      "grad_norm": 2.4794506447989306e-09,
      "kl": 0.048583984375,
      "learning_rate": 3.467908788072538e-07,
      "loss": 0.0019,
      "num_tokens": 198749193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2774
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.925,
      "grad_norm": 1.3046874691724497e-09,
      "kl": 0.04547119140625,
      "learning_rate": 3.4375977138350615e-07,
      "loss": 0.0018,
      "num_tokens": 198825193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2775
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9253333333333333,
      "grad_norm": 2.1103452318982363e-09,
      "kl": 0.04705810546875,
      "learning_rate": 3.4074173710931804e-07,
      "loss": 0.0019,
      "num_tokens": 198901577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2776
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9256666666666666,
      "grad_norm": 1.3667855736088086e-09,
      "kl": 0.04852294921875,
      "learning_rate": 3.377367800706732e-07,
      "loss": 0.0019,
      "num_tokens": 198974889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2777
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.926,
      "grad_norm": 1.7653651873672516e-09,
      "kl": 0.04608154296875,
      "learning_rate": 3.347449043358475e-07,
      "loss": 0.0018,
      "num_tokens": 199051529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2778
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9263333333333333,
      "grad_norm": 1.382111536329944e-09,
      "kl": 0.041748046875,
      "learning_rate": 3.3176611395540625e-07,
      "loss": 0.0017,
      "num_tokens": 199125481.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2779
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9266666666666666,
      "grad_norm": 2.1750283796251324e-09,
      "kl": 0.044921875,
      "learning_rate": 3.288004129622013e-07,
      "loss": 0.0018,
      "num_tokens": 199203129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2780
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.927,
      "grad_norm": 1.929479465090367e-09,
      "kl": 0.04644775390625,
      "learning_rate": 3.2584780537136206e-07,
      "loss": 0.0019,
      "num_tokens": 199278489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2781
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9273333333333333,
      "grad_norm": 3.5863938485647395e-09,
      "kl": 0.04779052734375,
      "learning_rate": 3.2290829518028867e-07,
      "loss": 0.0019,
      "num_tokens": 199356841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2782
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9276666666666666,
      "grad_norm": 1.3098312434678405e-09,
      "kl": 0.0501708984375,
      "learning_rate": 3.1998188636865325e-07,
      "loss": 0.002,
      "num_tokens": 199431001.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2783
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.928,
      "grad_norm": 1.1819325518303003e-09,
      "kl": 0.04339599609375,
      "learning_rate": 3.1706858289838994e-07,
      "loss": 0.0017,
      "num_tokens": 199506969.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2784
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9283333333333333,
      "grad_norm": 1.0809571016068276e-09,
      "kl": 0.044189453125,
      "learning_rate": 3.1416838871368925e-07,
      "loss": 0.0018,
      "num_tokens": 199582249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2785
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9286666666666666,
      "grad_norm": 2.2859372172945314e-09,
      "kl": 0.047607421875,
      "learning_rate": 3.112813077409926e-07,
      "loss": 0.0019,
      "num_tokens": 199658137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2786
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.929,
      "grad_norm": 2.359157980080795e-09,
      "kl": 0.0445556640625,
      "learning_rate": 3.0840734388898897e-07,
      "loss": 0.0018,
      "num_tokens": 199737033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2787
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9293333333333333,
      "grad_norm": 2.253692565901133e-09,
      "kl": 0.0445556640625,
      "learning_rate": 3.0554650104861137e-07,
      "loss": 0.0018,
      "num_tokens": 199813897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2788
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9296666666666666,
      "grad_norm": 9.50005962785383e-10,
      "kl": 0.04217529296875,
      "learning_rate": 3.026987830930239e-07,
      "loss": 0.0017,
      "num_tokens": 199888921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2789
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.93,
      "grad_norm": 2.291213663241365e-09,
      "kl": 0.04705810546875,
      "learning_rate": 2.9986419387762365e-07,
      "loss": 0.0019,
      "num_tokens": 199965993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2790
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9303333333333333,
      "grad_norm": 1.5451888657480595e-09,
      "kl": 0.04534912109375,
      "learning_rate": 2.970427372400353e-07,
      "loss": 0.0018,
      "num_tokens": 200042409.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2791
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9306666666666666,
      "grad_norm": 1.9826431607583572e-09,
      "kl": 0.04315185546875,
      "learning_rate": 2.94234417000101e-07,
      "loss": 0.0017,
      "num_tokens": 200117033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2792
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.931,
      "grad_norm": 1.260189841367776e-09,
      "kl": 0.04498291015625,
      "learning_rate": 2.9143923695987955e-07,
      "loss": 0.0018,
      "num_tokens": 200191673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2793
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9313333333333333,
      "grad_norm": 1.8147545688407263e-09,
      "kl": 0.0509033203125,
      "learning_rate": 2.8865720090364037e-07,
      "loss": 0.002,
      "num_tokens": 200267289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2794
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9316666666666666,
      "grad_norm": 2.1262047678050067e-09,
      "kl": 0.04559326171875,
      "learning_rate": 2.858883125978551e-07,
      "loss": 0.0018,
      "num_tokens": 200340969.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2795
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.932,
      "grad_norm": 1.1869067950698309e-09,
      "kl": 0.04559326171875,
      "learning_rate": 2.831325757911985e-07,
      "loss": 0.0018,
      "num_tokens": 200418809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2796
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9323333333333333,
      "grad_norm": 2.5276900572634986e-09,
      "kl": 0.0474853515625,
      "learning_rate": 2.8038999421453827e-07,
      "loss": 0.0019,
      "num_tokens": 200495081.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2797
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9326666666666666,
      "grad_norm": 1.8831241010985877e-09,
      "kl": 0.04547119140625,
      "learning_rate": 2.7766057158093217e-07,
      "loss": 0.0018,
      "num_tokens": 200571193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2798
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.933,
      "grad_norm": 2.656353359498098e-09,
      "kl": 0.04669189453125,
      "learning_rate": 2.749443115856232e-07,
      "loss": 0.0019,
      "num_tokens": 200648345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2799
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9333333333333333,
      "grad_norm": 2.2433330748583558e-09,
      "kl": 0.047119140625,
      "learning_rate": 2.7224121790603517e-07,
      "loss": 0.0019,
      "num_tokens": 200725449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2800
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9336666666666666,
      "grad_norm": 1.7229500048898672e-09,
      "kl": 0.03826904296875,
      "learning_rate": 2.6955129420176193e-07,
      "loss": 0.0015,
      "num_tokens": 200799817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2801
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.934,
      "grad_norm": 2.238356833217381e-09,
      "kl": 0.0465087890625,
      "learning_rate": 2.6687454411457256e-07,
      "loss": 0.0019,
      "num_tokens": 200876441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2802
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9343333333333333,
      "grad_norm": 1.3484483529779823e-09,
      "kl": 0.047119140625,
      "learning_rate": 2.6421097126839714e-07,
      "loss": 0.0019,
      "num_tokens": 200952649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2803
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9346666666666666,
      "grad_norm": 1.4187280239497113e-09,
      "kl": 0.04461669921875,
      "learning_rate": 2.6156057926932985e-07,
      "loss": 0.0018,
      "num_tokens": 201028633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2804
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.935,
      "grad_norm": 1.1227349050457747e-09,
      "kl": 0.047607421875,
      "learning_rate": 2.589233717056128e-07,
      "loss": 0.0019,
      "num_tokens": 201103385.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2805
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9353333333333333,
      "grad_norm": 9.553203561551982e-09,
      "kl": 0.042236328125,
      "learning_rate": 2.5629935214764866e-07,
      "loss": 0.0017,
      "num_tokens": 201184249.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2806
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9356666666666666,
      "grad_norm": 1.542145300348352e-09,
      "kl": 0.0440673828125,
      "learning_rate": 2.536885241479736e-07,
      "loss": 0.0018,
      "num_tokens": 201261801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2807
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.936,
      "grad_norm": 1.7628082327192374e-09,
      "kl": 0.04620361328125,
      "learning_rate": 2.510908912412746e-07,
      "loss": 0.0018,
      "num_tokens": 201336425.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2808
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9363333333333334,
      "grad_norm": 9.953243784721622e-10,
      "kl": 0.04473876953125,
      "learning_rate": 2.4850645694436736e-07,
      "loss": 0.0018,
      "num_tokens": 201410137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2809
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9366666666666666,
      "grad_norm": 2.7259841051119338e-09,
      "kl": 0.0478515625,
      "learning_rate": 2.4593522475620415e-07,
      "loss": 0.0019,
      "num_tokens": 201487641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2810
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.937,
      "grad_norm": 2.234079365948105e-09,
      "kl": 0.0472412109375,
      "learning_rate": 2.433771981578581e-07,
      "loss": 0.0019,
      "num_tokens": 201564377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2811
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9373333333333334,
      "grad_norm": 2.9833353565322795e-09,
      "kl": 0.04608154296875,
      "learning_rate": 2.4083238061252565e-07,
      "loss": 0.0018,
      "num_tokens": 201641337.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2812
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9376666666666666,
      "grad_norm": 1.631946910940485e-09,
      "kl": 0.04736328125,
      "learning_rate": 2.3830077556552424e-07,
      "loss": 0.0019,
      "num_tokens": 201717689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2813
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.938,
      "grad_norm": 1.9443995302736994e-09,
      "kl": 0.0460205078125,
      "learning_rate": 2.3578238644427763e-07,
      "loss": 0.0018,
      "num_tokens": 201792985.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2814
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9383333333333334,
      "grad_norm": 1.3453519409623027e-09,
      "kl": 0.04547119140625,
      "learning_rate": 2.332772166583208e-07,
      "loss": 0.0018,
      "num_tokens": 201867145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2815
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9386666666666666,
      "grad_norm": 1.4726727615155255e-09,
      "kl": 0.0496826171875,
      "learning_rate": 2.307852695992907e-07,
      "loss": 0.002,
      "num_tokens": 201941385.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2816
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.939,
      "grad_norm": 1.1133699517884565e-09,
      "kl": 0.0416259765625,
      "learning_rate": 2.2830654864092083e-07,
      "loss": 0.0017,
      "num_tokens": 202016377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2817
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9393333333333334,
      "grad_norm": 1.8193979656189185e-09,
      "kl": 0.04669189453125,
      "learning_rate": 2.2584105713904126e-07,
      "loss": 0.0019,
      "num_tokens": 202091865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2818
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9396666666666667,
      "grad_norm": 1.4408650939046197e-09,
      "kl": 0.0439453125,
      "learning_rate": 2.233887984315697e-07,
      "loss": 0.0018,
      "num_tokens": 202166857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2819
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.94,
      "grad_norm": 1.6705423711016465e-09,
      "kl": 0.04229736328125,
      "learning_rate": 2.209497758385104e-07,
      "loss": 0.0017,
      "num_tokens": 202243161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2820
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9403333333333334,
      "grad_norm": 1.6784451606355333e-09,
      "kl": 0.04595947265625,
      "learning_rate": 2.1852399266194312e-07,
      "loss": 0.0018,
      "num_tokens": 202318217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2821
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9406666666666667,
      "grad_norm": 1.1895961993246829e-09,
      "kl": 0.0401611328125,
      "learning_rate": 2.161114521860308e-07,
      "loss": 0.0016,
      "num_tokens": 202393801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2822
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.941,
      "grad_norm": 1.2220880973856652e-09,
      "kl": 0.04547119140625,
      "learning_rate": 2.137121576770007e-07,
      "loss": 0.0018,
      "num_tokens": 202469145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2823
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9413333333333334,
      "grad_norm": 1.229170432104354e-09,
      "kl": 0.04510498046875,
      "learning_rate": 2.1132611238315004e-07,
      "loss": 0.0018,
      "num_tokens": 202543769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2824
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9416666666666667,
      "grad_norm": 2.06150896353563e-09,
      "kl": 0.04705810546875,
      "learning_rate": 2.089533195348392e-07,
      "loss": 0.0019,
      "num_tokens": 202619321.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2825
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.942,
      "grad_norm": 1.8419480385389875e-09,
      "kl": 0.0477294921875,
      "learning_rate": 2.0659378234448524e-07,
      "loss": 0.0019,
      "num_tokens": 202697465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2826
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9423333333333334,
      "grad_norm": 1.99699923264518e-09,
      "kl": 0.046630859375,
      "learning_rate": 2.0424750400655947e-07,
      "loss": 0.0019,
      "num_tokens": 202774009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2827
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9426666666666667,
      "grad_norm": 1.5248219353836134e-09,
      "kl": 0.0469970703125,
      "learning_rate": 2.0191448769758315e-07,
      "loss": 0.0019,
      "num_tokens": 202850585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2828
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.943,
      "grad_norm": 1.2909239233138692e-09,
      "kl": 0.04510498046875,
      "learning_rate": 1.9959473657612193e-07,
      "loss": 0.0018,
      "num_tokens": 202924697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2829
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9433333333333334,
      "grad_norm": 2.3433068818690117e-09,
      "kl": 0.0450439453125,
      "learning_rate": 1.9728825378278248e-07,
      "loss": 0.0018,
      "num_tokens": 203001513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2830
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9436666666666667,
      "grad_norm": 1.8256807177152723e-09,
      "kl": 0.04425048828125,
      "learning_rate": 1.9499504244020694e-07,
      "loss": 0.0018,
      "num_tokens": 203078473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2831
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.944,
      "grad_norm": 1.4050293151157689e-09,
      "kl": 0.038543701171875,
      "learning_rate": 1.9271510565307405e-07,
      "loss": 0.0015,
      "num_tokens": 203158313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2832
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9443333333333334,
      "grad_norm": 1.134748295328336e-09,
      "kl": 0.04412841796875,
      "learning_rate": 1.9044844650808468e-07,
      "loss": 0.0018,
      "num_tokens": 203232505.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2833
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9446666666666667,
      "grad_norm": 1.644116731647216e-09,
      "kl": 0.047607421875,
      "learning_rate": 1.8819506807396748e-07,
      "loss": 0.0019,
      "num_tokens": 203309881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2834
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.945,
      "grad_norm": 2.5901472078260213e-09,
      "kl": 0.04547119140625,
      "learning_rate": 1.8595497340147316e-07,
      "loss": 0.0018,
      "num_tokens": 203384505.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2835
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9453333333333334,
      "grad_norm": 1.6057630780608179e-09,
      "kl": 0.0477294921875,
      "learning_rate": 1.8372816552336025e-07,
      "loss": 0.0019,
      "num_tokens": 203460361.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2836
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9456666666666667,
      "grad_norm": 1.5923178331433974e-09,
      "kl": 0.045166015625,
      "learning_rate": 1.8151464745440828e-07,
      "loss": 0.0018,
      "num_tokens": 203537129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2837
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.946,
      "grad_norm": 1.0007723538763003e-09,
      "kl": 0.04400634765625,
      "learning_rate": 1.793144221913967e-07,
      "loss": 0.0018,
      "num_tokens": 203610777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2838
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9463333333333334,
      "grad_norm": 1.3451804115049981e-09,
      "kl": 0.04827880859375,
      "learning_rate": 1.7712749271311392e-07,
      "loss": 0.0019,
      "num_tokens": 203686825.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2839
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9466666666666667,
      "grad_norm": 1.4056300567943936e-09,
      "kl": 0.04254150390625,
      "learning_rate": 1.7495386198034258e-07,
      "loss": 0.0017,
      "num_tokens": 203762345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2840
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.947,
      "grad_norm": 1.567531437984826e-09,
      "kl": 0.0479736328125,
      "learning_rate": 1.7279353293586765e-07,
      "loss": 0.0019,
      "num_tokens": 203837881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2841
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9473333333333334,
      "grad_norm": 1.19234444539984e-09,
      "kl": 0.04571533203125,
      "learning_rate": 1.706465085044584e-07,
      "loss": 0.0018,
      "num_tokens": 203916169.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2842
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9476666666666667,
      "grad_norm": 1.7183039435764158e-09,
      "kl": 0.0445556640625,
      "learning_rate": 1.6851279159287526e-07,
      "loss": 0.0018,
      "num_tokens": 203991529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2843
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.948,
      "grad_norm": 1.2892065193170765e-09,
      "kl": 0.04473876953125,
      "learning_rate": 1.6639238508986188e-07,
      "loss": 0.0018,
      "num_tokens": 204067369.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2844
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9483333333333334,
      "grad_norm": 1.4408597648341015e-09,
      "kl": 0.047119140625,
      "learning_rate": 1.6428529186614195e-07,
      "loss": 0.0019,
      "num_tokens": 204141913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2845
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9486666666666667,
      "grad_norm": 1.7149521802650725e-09,
      "kl": 0.04803466796875,
      "learning_rate": 1.6219151477441243e-07,
      "loss": 0.0019,
      "num_tokens": 204217545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2846
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.949,
      "grad_norm": 3.3588483105972955e-09,
      "kl": 0.04742431640625,
      "learning_rate": 1.601110566493458e-07,
      "loss": 0.0019,
      "num_tokens": 204296153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2847
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9493333333333334,
      "grad_norm": 1.1913562358856211e-09,
      "kl": 0.0419921875,
      "learning_rate": 1.580439203075812e-07,
      "loss": 0.0017,
      "num_tokens": 204370281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2848
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9496666666666667,
      "grad_norm": 1.0094374225388947e-09,
      "kl": 0.045166015625,
      "learning_rate": 1.5599010854772002e-07,
      "loss": 0.0018,
      "num_tokens": 204445161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": NaN,
      "rewards/penalized_accuracy_reward/std": NaN,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2849
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.95,
      "grad_norm": 1.824966289198926e-09,
      "kl": 0.0467529296875,
      "learning_rate": 1.5394962415032578e-07,
      "loss": 0.0019,
      "num_tokens": 204521017.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2850
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9503333333333334,
      "grad_norm": 2.020463352181423e-09,
      "kl": 0.04803466796875,
      "learning_rate": 1.519224698779198e-07,
      "loss": 0.0019,
      "num_tokens": 204596201.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2851
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9506666666666667,
      "grad_norm": 1.8384758160294723e-09,
      "kl": 0.0477294921875,
      "learning_rate": 1.4990864847497456e-07,
      "loss": 0.0019,
      "num_tokens": 204671193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2852
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.951,
      "grad_norm": 2.270664323233973e-09,
      "kl": 0.0450439453125,
      "learning_rate": 1.4790816266791018e-07,
      "loss": 0.0018,
      "num_tokens": 204746265.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2853
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9513333333333334,
      "grad_norm": 1.0415018847353963e-09,
      "kl": 0.0452880859375,
      "learning_rate": 1.4592101516509916e-07,
      "loss": 0.0018,
      "num_tokens": 204824121.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2854
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9516666666666667,
      "grad_norm": 2.003548438267444e-09,
      "kl": 0.04718017578125,
      "learning_rate": 1.4394720865684718e-07,
      "loss": 0.0019,
      "num_tokens": 204900937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2855
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.952,
      "grad_norm": 1.874883803765215e-09,
      "kl": 0.0455322265625,
      "learning_rate": 1.419867458154034e-07,
      "loss": 0.0018,
      "num_tokens": 204978665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2856
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9523333333333334,
      "grad_norm": 1.3125730502494548e-09,
      "kl": 0.04522705078125,
      "learning_rate": 1.400396292949513e-07,
      "loss": 0.0018,
      "num_tokens": 205055305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2857
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9526666666666667,
      "grad_norm": 2.379470176450127e-09,
      "kl": 0.04522705078125,
      "learning_rate": 1.3810586173160224e-07,
      "loss": 0.0018,
      "num_tokens": 205132313.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2858
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.953,
      "grad_norm": 1.365435209343957e-09,
      "kl": 0.04461669921875,
      "learning_rate": 1.3618544574339976e-07,
      "loss": 0.0018,
      "num_tokens": 205208585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2859
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9533333333333334,
      "grad_norm": 2.9286539859896266e-09,
      "kl": 0.04425048828125,
      "learning_rate": 1.3427838393030634e-07,
      "loss": 0.0018,
      "num_tokens": 205286121.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2860
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9536666666666667,
      "grad_norm": 1.1676538624882937e-09,
      "kl": 0.04595947265625,
      "learning_rate": 1.323846788742078e-07,
      "loss": 0.0018,
      "num_tokens": 205361657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2861
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.954,
      "grad_norm": 1.8650967437139343e-09,
      "kl": 0.04559326171875,
      "learning_rate": 1.3050433313890774e-07,
      "loss": 0.0018,
      "num_tokens": 205436889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2862
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9543333333333334,
      "grad_norm": 1.7542512997792414e-09,
      "kl": 0.04840087890625,
      "learning_rate": 1.2863734927012094e-07,
      "loss": 0.0019,
      "num_tokens": 205511801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2863
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9546666666666667,
      "grad_norm": 1.8528545364659976e-09,
      "kl": 0.0452880859375,
      "learning_rate": 1.2678372979547326e-07,
      "loss": 0.0018,
      "num_tokens": 205588649.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2864
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.955,
      "grad_norm": 1.8584306316071775e-09,
      "kl": 0.04400634765625,
      "learning_rate": 1.2494347722449506e-07,
      "loss": 0.0018,
      "num_tokens": 205663849.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2865
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9553333333333334,
      "grad_norm": 9.341071249835409e-10,
      "kl": 0.04730224609375,
      "learning_rate": 1.231165940486234e-07,
      "loss": 0.0019,
      "num_tokens": 205739769.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2866
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9556666666666667,
      "grad_norm": 2.132524601350383e-09,
      "kl": 0.0435791015625,
      "learning_rate": 1.2130308274119207e-07,
      "loss": 0.0017,
      "num_tokens": 205815513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2867
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.956,
      "grad_norm": 2.035346113871128e-09,
      "kl": 0.04541015625,
      "learning_rate": 1.1950294575743372e-07,
      "loss": 0.0018,
      "num_tokens": 205891225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2868
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9563333333333334,
      "grad_norm": 1.5192248570272682e-09,
      "kl": 0.04150390625,
      "learning_rate": 1.1771618553447217e-07,
      "loss": 0.0017,
      "num_tokens": 205971609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2869
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9566666666666667,
      "grad_norm": 1.6769144961514826e-09,
      "kl": 0.0489501953125,
      "learning_rate": 1.1594280449132245e-07,
      "loss": 0.002,
      "num_tokens": 206046729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2870
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.957,
      "grad_norm": 1.3430211387444047e-09,
      "kl": 0.043212890625,
      "learning_rate": 1.1418280502888401e-07,
      "loss": 0.0017,
      "num_tokens": 206119801.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2871
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9573333333333334,
      "grad_norm": 1.8542503088525564e-09,
      "kl": 0.044189453125,
      "learning_rate": 1.1243618952994195e-07,
      "loss": 0.0018,
      "num_tokens": 206195625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2872
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9576666666666667,
      "grad_norm": 1.0374904269028207e-09,
      "kl": 0.04833984375,
      "learning_rate": 1.1070296035916028e-07,
      "loss": 0.0019,
      "num_tokens": 206270905.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2873
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.958,
      "grad_norm": 1.4874067533199309e-09,
      "kl": 0.0458984375,
      "learning_rate": 1.0898311986307975e-07,
      "loss": 0.0018,
      "num_tokens": 206345721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2874
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9583333333333334,
      "grad_norm": 1.2928865755768015e-09,
      "kl": 0.04388427734375,
      "learning_rate": 1.0727667037011668e-07,
      "loss": 0.0018,
      "num_tokens": 206419321.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2875
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9586666666666667,
      "grad_norm": 1.1247179854123601e-09,
      "kl": 0.0517578125,
      "learning_rate": 1.055836141905553e-07,
      "loss": 0.0021,
      "num_tokens": 206495753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2876
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.959,
      "grad_norm": 1.0328591315555968e-09,
      "kl": 0.04681396484375,
      "learning_rate": 1.039039536165476e-07,
      "loss": 0.0019,
      "num_tokens": 206569737.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2877
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9593333333333334,
      "grad_norm": 1.6002604796838682e-09,
      "kl": 0.047119140625,
      "learning_rate": 1.0223769092211012e-07,
      "loss": 0.0019,
      "num_tokens": 206645497.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2878
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9596666666666667,
      "grad_norm": 1.1132157418103361e-09,
      "kl": 0.04376220703125,
      "learning_rate": 1.0058482836312278e-07,
      "loss": 0.0018,
      "num_tokens": 206718857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2879
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.96,
      "grad_norm": 1.2266860860421502e-09,
      "kl": 0.04254150390625,
      "learning_rate": 9.894536817732226e-08,
      "loss": 0.0017,
      "num_tokens": 206795097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2880
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9603333333333334,
      "grad_norm": 1.0729875876691608e-09,
      "kl": 0.04473876953125,
      "learning_rate": 9.731931258429638e-08,
      "loss": 0.0018,
      "num_tokens": 206872473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2881
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9606666666666667,
      "grad_norm": 2.1439465758277265e-09,
      "kl": 0.0478515625,
      "learning_rate": 9.57066637854931e-08,
      "loss": 0.0019,
      "num_tokens": 206949609.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2882
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.961,
      "grad_norm": 1.2095711099391337e-09,
      "kl": 0.046142578125,
      "learning_rate": 9.410742396420259e-08,
      "loss": 0.0018,
      "num_tokens": 207024793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2883
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9613333333333334,
      "grad_norm": 1.4314726071162909e-09,
      "kl": 0.04443359375,
      "learning_rate": 9.252159528556404e-08,
      "loss": 0.0018,
      "num_tokens": 207100761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2884
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9616666666666667,
      "grad_norm": 2.614742644624357e-09,
      "kl": 0.04522705078125,
      "learning_rate": 9.094917989656005e-08,
      "loss": 0.0018,
      "num_tokens": 207177289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2885
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.962,
      "grad_norm": 1.0771308289747594e-09,
      "kl": 0.0433349609375,
      "learning_rate": 8.939017992601329e-08,
      "loss": 0.0017,
      "num_tokens": 207252729.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2886
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9623333333333334,
      "grad_norm": 1.5476608883346898e-09,
      "kl": 0.05035400390625,
      "learning_rate": 8.784459748458318e-08,
      "loss": 0.002,
      "num_tokens": 207328153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2887
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9626666666666667,
      "grad_norm": 2.431616463738351e-09,
      "kl": 0.0460205078125,
      "learning_rate": 8.631243466476368e-08,
      "loss": 0.0018,
      "num_tokens": 207404345.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2888
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.963,
      "grad_norm": 1.7921398809406242e-09,
      "kl": 0.04473876953125,
      "learning_rate": 8.479369354088329e-08,
      "loss": 0.0018,
      "num_tokens": 207482153.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2889
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9633333333333334,
      "grad_norm": 1.2141361249717875e-09,
      "kl": 0.04571533203125,
      "learning_rate": 8.328837616909612e-08,
      "loss": 0.0018,
      "num_tokens": 207558185.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2890
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9636666666666667,
      "grad_norm": 1.8377752653009338e-09,
      "kl": 0.046630859375,
      "learning_rate": 8.179648458738309e-08,
      "loss": 0.0019,
      "num_tokens": 207633193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2891
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.964,
      "grad_norm": 1.3388734565467075e-09,
      "kl": 0.047119140625,
      "learning_rate": 8.031802081554963e-08,
      "loss": 0.0019,
      "num_tokens": 207708681.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2892
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9643333333333334,
      "grad_norm": 1.762052503906375e-09,
      "kl": 0.0487060546875,
      "learning_rate": 7.885298685522235e-08,
      "loss": 0.0019,
      "num_tokens": 207784809.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2893
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9646666666666667,
      "grad_norm": 9.543931200894917e-10,
      "kl": 0.04534912109375,
      "learning_rate": 7.740138468984249e-08,
      "loss": 0.0018,
      "num_tokens": 207859353.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2894
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.965,
      "grad_norm": 1.5075356518678973e-09,
      "kl": 0.045654296875,
      "learning_rate": 7.596321628467129e-08,
      "loss": 0.0018,
      "num_tokens": 207935097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2895
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9653333333333334,
      "grad_norm": 1.4149178495515002e-09,
      "kl": 0.044921875,
      "learning_rate": 7.453848358678018e-08,
      "loss": 0.0018,
      "num_tokens": 208010857.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2896
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9656666666666667,
      "grad_norm": 1.5315223533818312e-09,
      "kl": 0.04779052734375,
      "learning_rate": 7.31271885250484e-08,
      "loss": 0.0019,
      "num_tokens": 208085225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2897
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.966,
      "grad_norm": 1.2852930941775753e-09,
      "kl": 0.04315185546875,
      "learning_rate": 7.17293330101676e-08,
      "loss": 0.0017,
      "num_tokens": 208161225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2898
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9663333333333334,
      "grad_norm": 1.6096177724023164e-09,
      "kl": 0.0455322265625,
      "learning_rate": 7.034491893463059e-08,
      "loss": 0.0018,
      "num_tokens": 208236185.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2899
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9666666666666667,
      "grad_norm": 1.301025953637236e-09,
      "kl": 0.048583984375,
      "learning_rate": 6.897394817273251e-08,
      "loss": 0.0019,
      "num_tokens": 208309817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2900
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.967,
      "grad_norm": 1.3912451191089303e-09,
      "kl": 0.0452880859375,
      "learning_rate": 6.761642258056977e-08,
      "loss": 0.0018,
      "num_tokens": 208384601.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2901
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9673333333333334,
      "grad_norm": 1.964226781225875e-09,
      "kl": 0.04876708984375,
      "learning_rate": 6.627234399603554e-08,
      "loss": 0.0019,
      "num_tokens": 208460553.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2902
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9676666666666667,
      "grad_norm": 1.1935673560614646e-09,
      "kl": 0.04437255859375,
      "learning_rate": 6.494171423881756e-08,
      "loss": 0.0018,
      "num_tokens": 208537145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2903
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.968,
      "grad_norm": 1.7335108903893115e-09,
      "kl": 0.0479736328125,
      "learning_rate": 6.362453511039368e-08,
      "loss": 0.0019,
      "num_tokens": 208612665.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2904
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9683333333333334,
      "grad_norm": 2.3246899960582823e-09,
      "kl": 0.04437255859375,
      "learning_rate": 6.232080839403631e-08,
      "loss": 0.0018,
      "num_tokens": 208689641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2905
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9686666666666667,
      "grad_norm": 1.3625027772690146e-09,
      "kl": 0.04541015625,
      "learning_rate": 6.103053585480023e-08,
      "loss": 0.0018,
      "num_tokens": 208766793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2906
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.969,
      "grad_norm": 1.274568006692789e-09,
      "kl": 0.04608154296875,
      "learning_rate": 5.975371923952921e-08,
      "loss": 0.0018,
      "num_tokens": 208841209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2907
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9693333333333334,
      "grad_norm": 9.741473183666471e-10,
      "kl": 0.0457763671875,
      "learning_rate": 5.849036027684607e-08,
      "loss": 0.0018,
      "num_tokens": 208915161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2908
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9696666666666667,
      "grad_norm": 1.227282053761769e-09,
      "kl": 0.041748046875,
      "learning_rate": 5.724046067715705e-08,
      "loss": 0.0017,
      "num_tokens": 208991033.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2909
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.97,
      "grad_norm": 1.3553156374968012e-09,
      "kl": 0.04925537109375,
      "learning_rate": 5.600402213264411e-08,
      "loss": 0.002,
      "num_tokens": 209067465.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2910
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9703333333333334,
      "grad_norm": 1.697823548418853e-09,
      "kl": 0.0477294921875,
      "learning_rate": 5.4781046317267103e-08,
      "loss": 0.0019,
      "num_tokens": 209142793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2911
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9706666666666667,
      "grad_norm": 1.3070936555337198e-09,
      "kl": 0.0447998046875,
      "learning_rate": 5.3571534886756035e-08,
      "loss": 0.0018,
      "num_tokens": 209217657.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2912
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.971,
      "grad_norm": 1.4478411802798519e-09,
      "kl": 0.04486083984375,
      "learning_rate": 5.2375489478616593e-08,
      "loss": 0.0018,
      "num_tokens": 209292297.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2913
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9713333333333334,
      "grad_norm": 1.4970747974629717e-09,
      "kl": 0.044189453125,
      "learning_rate": 5.119291171211793e-08,
      "loss": 0.0018,
      "num_tokens": 209367161.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2914
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9716666666666667,
      "grad_norm": 1.5472362280277707e-09,
      "kl": 0.04730224609375,
      "learning_rate": 5.002380318830158e-08,
      "loss": 0.0019,
      "num_tokens": 209443545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2915
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.972,
      "grad_norm": 2.7980964212304116e-09,
      "kl": 0.0452880859375,
      "learning_rate": 4.88681654899692e-08,
      "loss": 0.0018,
      "num_tokens": 209519881.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2916
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9723333333333334,
      "grad_norm": 1.9377759397087857e-09,
      "kl": 0.04351806640625,
      "learning_rate": 4.772600018168816e-08,
      "loss": 0.0017,
      "num_tokens": 209599225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2917
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9726666666666667,
      "grad_norm": 9.536348377636727e-10,
      "kl": 0.0439453125,
      "learning_rate": 4.659730880978375e-08,
      "loss": 0.0018,
      "num_tokens": 209673049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2918
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.973,
      "grad_norm": 1.9756634106471438e-09,
      "kl": 0.0474853515625,
      "learning_rate": 4.54820929023414e-08,
      "loss": 0.0019,
      "num_tokens": 209749225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2919
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9733333333333334,
      "grad_norm": 1.550075512390947e-09,
      "kl": 0.0458984375,
      "learning_rate": 4.438035396920004e-08,
      "loss": 0.0018,
      "num_tokens": 209827097.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2920
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9736666666666667,
      "grad_norm": 1.2801216753288713e-09,
      "kl": 0.0469970703125,
      "learning_rate": 4.329209350195651e-08,
      "loss": 0.0019,
      "num_tokens": 209900393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2921
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.974,
      "grad_norm": 1.7890703363221405e-09,
      "kl": 0.04486083984375,
      "learning_rate": 4.2217312973955594e-08,
      "loss": 0.0018,
      "num_tokens": 209976793.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2922
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9743333333333334,
      "grad_norm": 9.99578309013316e-10,
      "kl": 0.04931640625,
      "learning_rate": 4.115601384029666e-08,
      "loss": 0.002,
      "num_tokens": 210052041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2923
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9746666666666667,
      "grad_norm": 2.530295750702294e-09,
      "kl": 0.0452880859375,
      "learning_rate": 4.010819753782369e-08,
      "loss": 0.0018,
      "num_tokens": 210128873.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2924
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.975,
      "grad_norm": 1.4857606256413192e-09,
      "kl": 0.0472412109375,
      "learning_rate": 3.907386548512748e-08,
      "loss": 0.0019,
      "num_tokens": 210204585.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2925
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9753333333333334,
      "grad_norm": 1.181739484046318e-09,
      "kl": 0.042724609375,
      "learning_rate": 3.805301908254455e-08,
      "loss": 0.0017,
      "num_tokens": 210280009.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2926
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9756666666666667,
      "grad_norm": 1.7818566622196386e-09,
      "kl": 0.0450439453125,
      "learning_rate": 3.704565971215379e-08,
      "loss": 0.0018,
      "num_tokens": 210355865.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2927
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.976,
      "grad_norm": 1.3478381744036483e-09,
      "kl": 0.04571533203125,
      "learning_rate": 3.605178873777204e-08,
      "loss": 0.0018,
      "num_tokens": 210429641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2928
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9763333333333334,
      "grad_norm": 9.184745186630039e-10,
      "kl": 0.0430908203125,
      "learning_rate": 3.50714075049563e-08,
      "loss": 0.0017,
      "num_tokens": 210504073.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2929
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9766666666666667,
      "grad_norm": 3.5453104896276955e-09,
      "kl": 0.04327392578125,
      "learning_rate": 3.410451734100262e-08,
      "loss": 0.0017,
      "num_tokens": 210582713.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2930
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.977,
      "grad_norm": 1.5028326361132827e-09,
      "kl": 0.04400634765625,
      "learning_rate": 3.315111955493944e-08,
      "loss": 0.0018,
      "num_tokens": 210659209.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2931
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9773333333333334,
      "grad_norm": 1.7363386284330318e-09,
      "kl": 0.0452880859375,
      "learning_rate": 3.22112154375287e-08,
      "loss": 0.0018,
      "num_tokens": 210735129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2932
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9776666666666667,
      "grad_norm": 1.4197287789841084e-09,
      "kl": 0.04571533203125,
      "learning_rate": 3.1284806261264735e-08,
      "loss": 0.0018,
      "num_tokens": 210810441.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2933
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.978,
      "grad_norm": 1.538000504730519e-09,
      "kl": 0.0491943359375,
      "learning_rate": 3.037189328036982e-08,
      "loss": 0.002,
      "num_tokens": 210884745.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2934
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9783333333333334,
      "grad_norm": 2.8975926102958738e-09,
      "kl": 0.04522705078125,
      "learning_rate": 2.947247773079753e-08,
      "loss": 0.0018,
      "num_tokens": 210962633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2935
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9786666666666667,
      "grad_norm": 1.5917376305907283e-09,
      "kl": 0.04351806640625,
      "learning_rate": 2.858656083022604e-08,
      "loss": 0.0017,
      "num_tokens": 211038937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2936
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.979,
      "grad_norm": 8.104458770752387e-10,
      "kl": 0.04290771484375,
      "learning_rate": 2.7714143778058146e-08,
      "loss": 0.0017,
      "num_tokens": 211112841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2937
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9793333333333333,
      "grad_norm": 1.6967492966202258e-09,
      "kl": 0.04425048828125,
      "learning_rate": 2.6855227755419046e-08,
      "loss": 0.0018,
      "num_tokens": 211191225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2938
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9796666666666667,
      "grad_norm": 2.49811260566446e-09,
      "kl": 0.04876708984375,
      "learning_rate": 2.6009813925157446e-08,
      "loss": 0.0019,
      "num_tokens": 211266281.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2939
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.98,
      "grad_norm": 1.821839790139279e-09,
      "kl": 0.04718017578125,
      "learning_rate": 2.5177903431842233e-08,
      "loss": 0.0019,
      "num_tokens": 211342377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2940
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9803333333333333,
      "grad_norm": 1.2046053043945903e-09,
      "kl": 0.04461669921875,
      "learning_rate": 2.4359497401758026e-08,
      "loss": 0.0018,
      "num_tokens": 211416777.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2941
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9806666666666667,
      "grad_norm": 1.3474217297471114e-09,
      "kl": 0.04681396484375,
      "learning_rate": 2.3554596942907404e-08,
      "loss": 0.0019,
      "num_tokens": 211491065.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2942
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.981,
      "grad_norm": 1.6224258603259045e-09,
      "kl": 0.04571533203125,
      "learning_rate": 2.2763203145010904e-08,
      "loss": 0.0018,
      "num_tokens": 211567145.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2943
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9813333333333333,
      "grad_norm": 1.3303408374909509e-09,
      "kl": 0.04632568359375,
      "learning_rate": 2.1985317079500358e-08,
      "loss": 0.0019,
      "num_tokens": 211641929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2944
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9816666666666667,
      "grad_norm": 1.0727693178225195e-09,
      "kl": 0.04339599609375,
      "learning_rate": 2.1220939799520003e-08,
      "loss": 0.0017,
      "num_tokens": 211717049.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2945
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.982,
      "grad_norm": 1.5592430679944869e-09,
      "kl": 0.0450439453125,
      "learning_rate": 2.0470072339926482e-08,
      "loss": 0.0018,
      "num_tokens": 211791913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2946
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9823333333333333,
      "grad_norm": 1.6663457280685634e-09,
      "kl": 0.0469970703125,
      "learning_rate": 1.973271571728441e-08,
      "loss": 0.0019,
      "num_tokens": 211868921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2947
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9826666666666667,
      "grad_norm": 1.7467541857385527e-09,
      "kl": 0.04486083984375,
      "learning_rate": 1.9008870929869692e-08,
      "loss": 0.0018,
      "num_tokens": 211947673.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": null,
      "rewards/penalized_accuracy_reward/std": null,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2948
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.983,
      "grad_norm": 1.6448671313895602e-09,
      "kl": 0.04595947265625,
      "learning_rate": 1.829853895766176e-08,
      "loss": 0.0018,
      "num_tokens": 212021193.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2949
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9833333333333333,
      "grad_norm": 2.2270771893317942e-09,
      "kl": 0.04315185546875,
      "learning_rate": 1.7601720762346895e-08,
      "loss": 0.0017,
      "num_tokens": 212099225.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2950
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9836666666666667,
      "grad_norm": 1.8324621819942877e-09,
      "kl": 0.0440673828125,
      "learning_rate": 1.6918417287318245e-08,
      "loss": 0.0018,
      "num_tokens": 212178393.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2951
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.984,
      "grad_norm": 1.5053276403165228e-09,
      "kl": 0.04345703125,
      "learning_rate": 1.624862945766692e-08,
      "loss": 0.0017,
      "num_tokens": 212254041.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2952
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9843333333333333,
      "grad_norm": 1.5199475011939967e-09,
      "kl": 0.04852294921875,
      "learning_rate": 1.5592358180189782e-08,
      "loss": 0.0019,
      "num_tokens": 212329129.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2953
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9846666666666667,
      "grad_norm": 5.194201957436917e-09,
      "kl": 0.0499267578125,
      "learning_rate": 1.4949604343383882e-08,
      "loss": 0.002,
      "num_tokens": 212407929.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2954
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.985,
      "grad_norm": 2.071116611546131e-09,
      "kl": 0.04473876953125,
      "learning_rate": 1.4320368817443142e-08,
      "loss": 0.0018,
      "num_tokens": 212487513.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2955
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9853333333333333,
      "grad_norm": 9.866411021519639e-10,
      "kl": 0.044189453125,
      "learning_rate": 1.370465245426167e-08,
      "loss": 0.0018,
      "num_tokens": 212560921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2956
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9856666666666667,
      "grad_norm": 2.2598105609006325e-09,
      "kl": 0.04339599609375,
      "learning_rate": 1.3102456087430437e-08,
      "loss": 0.0017,
      "num_tokens": 212640537.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2957
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.986,
      "grad_norm": 2.6744608749851295e-09,
      "kl": 0.04046630859375,
      "learning_rate": 1.2513780532236175e-08,
      "loss": 0.0016,
      "num_tokens": 212716841.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2958
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9863333333333333,
      "grad_norm": 1.0582833498418154e-09,
      "kl": 0.0440673828125,
      "learning_rate": 1.1938626585660252e-08,
      "loss": 0.0018,
      "num_tokens": 212792137.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2959
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9866666666666667,
      "grad_norm": 1.0848154596843074e-09,
      "kl": 0.04638671875,
      "learning_rate": 1.1376995026376459e-08,
      "loss": 0.0019,
      "num_tokens": 212866473.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2960
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.987,
      "grad_norm": 1.36530775574073e-09,
      "kl": 0.0452880859375,
      "learning_rate": 1.0828886614754342e-08,
      "loss": 0.0018,
      "num_tokens": 212939721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2961
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9873333333333333,
      "grad_norm": 1.657683545985833e-09,
      "kl": 0.04949951171875,
      "learning_rate": 1.0294302092853647e-08,
      "loss": 0.002,
      "num_tokens": 213017641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2962
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9876666666666667,
      "grad_norm": 1.4429349937117308e-09,
      "kl": 0.0435791015625,
      "learning_rate": 9.773242184422105e-09,
      "loss": 0.0017,
      "num_tokens": 213093305.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2963
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.988,
      "grad_norm": 3.4310256857850163e-09,
      "kl": 0.04547119140625,
      "learning_rate": 9.265707594899864e-09,
      "loss": 0.0018,
      "num_tokens": 213170377.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2964
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9883333333333333,
      "grad_norm": 1.935559934551634e-09,
      "kl": 0.0472412109375,
      "learning_rate": 8.771699011416169e-09,
      "loss": 0.0019,
      "num_tokens": 213247577.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2965
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9886666666666667,
      "grad_norm": 1.523074333320551e-09,
      "kl": 0.043701171875,
      "learning_rate": 8.29121710278713e-09,
      "loss": 0.0017,
      "num_tokens": 213322633.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2966
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.989,
      "grad_norm": 1.2285142902968005e-09,
      "kl": 0.04644775390625,
      "learning_rate": 7.824262519514625e-09,
      "loss": 0.0019,
      "num_tokens": 213398233.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2967
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9893333333333333,
      "grad_norm": 9.537842737827873e-10,
      "kl": 0.04425048828125,
      "learning_rate": 7.370835893788508e-09,
      "loss": 0.0018,
      "num_tokens": 213473689.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2968
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9896666666666667,
      "grad_norm": 1.1185187220874582e-09,
      "kl": 0.042236328125,
      "learning_rate": 6.930937839481067e-09,
      "loss": 0.0017,
      "num_tokens": 213550489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2969
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.99,
      "grad_norm": 1.2443580610366212e-09,
      "kl": 0.0458984375,
      "learning_rate": 6.504568952152568e-09,
      "loss": 0.0018,
      "num_tokens": 213624761.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2970
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9903333333333333,
      "grad_norm": 1.9045065524636584e-09,
      "kl": 0.04449462890625,
      "learning_rate": 6.091729809042379e-09,
      "loss": 0.0018,
      "num_tokens": 213700953.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2971
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9906666666666667,
      "grad_norm": 1.3548655530826181e-09,
      "kl": 0.04327392578125,
      "learning_rate": 5.6924209690767395e-09,
      "loss": 0.0017,
      "num_tokens": 213775721.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2972
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.991,
      "grad_norm": 1.8117720657073733e-09,
      "kl": 0.0498046875,
      "learning_rate": 5.306642972862097e-09,
      "loss": 0.002,
      "num_tokens": 213852889.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2973
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9913333333333333,
      "grad_norm": 1.3182867020233857e-09,
      "kl": 0.04449462890625,
      "learning_rate": 4.9343963426840006e-09,
      "loss": 0.0018,
      "num_tokens": 213927913.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2974
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9916666666666667,
      "grad_norm": 2.8226203596659616e-09,
      "kl": 0.04742431640625,
      "learning_rate": 4.575681582512648e-09,
      "loss": 0.0019,
      "num_tokens": 214003529.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2975
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.992,
      "grad_norm": 3.2701474861340785e-09,
      "kl": 0.04498291015625,
      "learning_rate": 4.230499177994007e-09,
      "loss": 0.0018,
      "num_tokens": 214081433.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2976
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9923333333333333,
      "grad_norm": 2.0030501701739922e-09,
      "kl": 0.044921875,
      "learning_rate": 3.898849596456477e-09,
      "loss": 0.0018,
      "num_tokens": 214155545.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2977
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9926666666666667,
      "grad_norm": 1.888650125181357e-09,
      "kl": 0.0489501953125,
      "learning_rate": 3.5807332869042256e-09,
      "loss": 0.002,
      "num_tokens": 214231993.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2978
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.993,
      "grad_norm": 2.888982386650696e-09,
      "kl": 0.04742431640625,
      "learning_rate": 3.276150680021628e-09,
      "loss": 0.0019,
      "num_tokens": 214311561.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2979
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9933333333333333,
      "grad_norm": 1.0059407751228377e-09,
      "kl": 0.04742431640625,
      "learning_rate": 2.9851021881688314e-09,
      "loss": 0.0019,
      "num_tokens": 214385641.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2980
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9936666666666667,
      "grad_norm": 1.671291660620966e-09,
      "kl": 0.048828125,
      "learning_rate": 2.7075882053828605e-09,
      "loss": 0.002,
      "num_tokens": 214461945.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2981
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.994,
      "grad_norm": 2.2417905309879416e-09,
      "kl": 0.04388427734375,
      "learning_rate": 2.4436091073787304e-09,
      "loss": 0.0018,
      "num_tokens": 214541289.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2982
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9943333333333333,
      "grad_norm": 2.0122878918726883e-09,
      "kl": 0.045654296875,
      "learning_rate": 2.193165251545004e-09,
      "loss": 0.0018,
      "num_tokens": 214615785.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2983
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9946666666666667,
      "grad_norm": 1.3068360837920068e-09,
      "kl": 0.04376220703125,
      "learning_rate": 1.956256976947124e-09,
      "loss": 0.0018,
      "num_tokens": 214691385.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2984
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.995,
      "grad_norm": 1.7166823518266483e-09,
      "kl": 0.04534912109375,
      "learning_rate": 1.7328846043229707e-09,
      "loss": 0.0018,
      "num_tokens": 214767897.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2985
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9953333333333333,
      "grad_norm": 1.6194776630840124e-09,
      "kl": 0.0521240234375,
      "learning_rate": 1.5230484360873043e-09,
      "loss": 0.0021,
      "num_tokens": 214844217.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2986
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9956666666666667,
      "grad_norm": 1.1248711961897584e-09,
      "kl": 0.04620361328125,
      "learning_rate": 1.3267487563284332e-09,
      "loss": 0.0019,
      "num_tokens": 214922697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2987
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.996,
      "grad_norm": 9.915434029394987e-10,
      "kl": 0.04498291015625,
      "learning_rate": 1.1439858308071038e-09,
      "loss": 0.0018,
      "num_tokens": 214997449.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2988
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9963333333333333,
      "grad_norm": 1.6820052017862963e-09,
      "kl": 0.04534912109375,
      "learning_rate": 9.74759906957612e-10,
      "loss": 0.0018,
      "num_tokens": 215072921.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2989
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9966666666666667,
      "grad_norm": 2.472805737951944e-09,
      "kl": 0.043701171875,
      "learning_rate": 8.19071213887801e-10,
      "loss": 0.0017,
      "num_tokens": 215152937.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2990
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.997,
      "grad_norm": 1.426590956477014e-09,
      "kl": 0.0416259765625,
      "learning_rate": 6.769199623779532e-10,
      "loss": 0.0017,
      "num_tokens": 215227705.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2991
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9973333333333333,
      "grad_norm": 1.0423627516686906e-09,
      "kl": 0.0418701171875,
      "learning_rate": 5.483063448785686e-10,
      "loss": 0.0017,
      "num_tokens": 215303561.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2992
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9976666666666667,
      "grad_norm": 1.9338459722462176e-09,
      "kl": 0.0458984375,
      "learning_rate": 4.332305355159161e-10,
      "loss": 0.0018,
      "num_tokens": 215383241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2993
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.998,
      "grad_norm": 2.1644668279918733e-09,
      "kl": 0.0440673828125,
      "learning_rate": 3.316926900842621e-10,
      "loss": 0.0018,
      "num_tokens": 215461625.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2994
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9983333333333333,
      "grad_norm": 1.713874042685859e-09,
      "kl": 0.04571533203125,
      "learning_rate": 2.436929460525317e-10,
      "loss": 0.0018,
      "num_tokens": 215538601.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2995
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9986666666666667,
      "grad_norm": 1.55350254882336e-09,
      "kl": 0.04632568359375,
      "learning_rate": 1.6923142255764745e-10,
      "loss": 0.0019,
      "num_tokens": 215612489.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2996
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.999,
      "grad_norm": 2.735165649525584e-09,
      "kl": 0.04730224609375,
      "learning_rate": 1.0830822041230093e-10,
      "loss": 0.0019,
      "num_tokens": 215688697.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2997
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9993333333333333,
      "grad_norm": 2.0403514433553482e-09,
      "kl": 0.0482177734375,
      "learning_rate": 6.092342209607083e-11,
      "loss": 0.0019,
      "num_tokens": 215763817.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2998
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 0.9996666666666667,
      "grad_norm": 1.1584365688932508e-09,
      "kl": 0.0440673828125,
      "learning_rate": 2.7077091762084396e-11,
      "loss": 0.0018,
      "num_tokens": 215839241.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 2999
    },
    {
      "clip_ratio": 0.0,
      "completions/clipped_ratio": 1.0,
      "completions/max_length": 1024.0,
      "completions/max_terminated_length": 0.0,
      "completions/mean_length": 1024.0,
      "completions/mean_terminated_length": 0.0,
      "completions/min_length": 1024.0,
      "completions/min_terminated_length": 0.0,
      "epoch": 1.0,
      "grad_norm": 1.9258936667654325e-09,
      "kl": 0.0445556640625,
      "learning_rate": 6.7692752314663104e-12,
      "loss": 0.0018,
      "num_tokens": 215917753.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/format_reward/mean": 0.0,
      "rewards/format_reward/std": 0.0,
      "rewards/penalized_accuracy_reward/mean": 0.0,
      "rewards/penalized_accuracy_reward/std": 0.0,
      "rewards/reasoning_steps_reward/mean": 0.0,
      "rewards/reasoning_steps_reward/std": 0.0,
      "rewards/tag_count_reward/mean": 0.0,
      "rewards/tag_count_reward/std": 0.0,
      "step": 3000
    },
    {
      "epoch": 1.0,
      "step": 3000,
      "total_flos": 0.0,
      "train_loss": 0.01067225778910021,
      "train_runtime": 90873.507,
      "train_samples_per_second": 0.132,
      "train_steps_per_second": 0.033
    }
  ],
  "logging_steps": 1,
  "max_steps": 3000,
  "num_input_tokens_seen": 215917753,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}