{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 100,
  "global_step": 600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05,
      "grad_norm": 1.7140417098999023,
      "learning_rate": 1.5e-05,
      "loss": 0.7338,
      "step": 10
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.1201739311218262,
      "learning_rate": 3.1666666666666666e-05,
      "loss": 0.3534,
      "step": 20
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.3740745782852173,
      "learning_rate": 4.8333333333333334e-05,
      "loss": 0.1776,
      "step": 30
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.7445772886276245,
      "learning_rate": 6.500000000000001e-05,
      "loss": 0.1226,
      "step": 40
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.6376151442527771,
      "learning_rate": 8.166666666666667e-05,
      "loss": 0.1086,
      "step": 50
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.37051457166671753,
      "learning_rate": 9.833333333333333e-05,
      "loss": 0.0996,
      "step": 60
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.5768002867698669,
      "learning_rate": 9.99314767377287e-05,
      "loss": 0.0959,
      "step": 70
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.5846573114395142,
      "learning_rate": 9.9694847320726e-05,
      "loss": 0.0673,
      "step": 80
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.5731233954429626,
      "learning_rate": 9.929006627092299e-05,
      "loss": 0.1235,
      "step": 90
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.7275082468986511,
      "learning_rate": 9.871850323926177e-05,
      "loss": 0.0779,
      "step": 100
    },
    {
      "epoch": 0.5,
      "eval_loss": 0.0684906542301178,
      "eval_runtime": 263.652,
      "eval_samples_per_second": 0.379,
      "eval_steps_per_second": 0.379,
      "step": 100
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.5399947762489319,
      "learning_rate": 9.798209221411747e-05,
      "loss": 0.0591,
      "step": 110
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.39991146326065063,
      "learning_rate": 9.708332497729378e-05,
      "loss": 0.0841,
      "step": 120
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.4473568797111511,
      "learning_rate": 9.602524267262203e-05,
      "loss": 0.0823,
      "step": 130
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.6712111234664917,
      "learning_rate": 9.481142551569318e-05,
      "loss": 0.0686,
      "step": 140
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.3721703886985779,
      "learning_rate": 9.344598067954152e-05,
      "loss": 0.0658,
      "step": 150
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.43187215924263,
      "learning_rate": 9.193352839727121e-05,
      "loss": 0.0669,
      "step": 160
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.3821048140525818,
      "learning_rate": 9.027918632864997e-05,
      "loss": 0.0898,
      "step": 170
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.7914325594902039,
      "learning_rate": 8.848855224356839e-05,
      "loss": 0.0777,
      "step": 180
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.49602773785591125,
      "learning_rate": 8.656768508095853e-05,
      "loss": 0.0602,
      "step": 190
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.36420896649360657,
      "learning_rate": 8.452308444726249e-05,
      "loss": 0.0647,
      "step": 200
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.05108840763568878,
      "eval_runtime": 263.56,
      "eval_samples_per_second": 0.379,
      "eval_steps_per_second": 0.379,
      "step": 200
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.29956021904945374,
      "learning_rate": 8.236166862382163e-05,
      "loss": 0.0381,
      "step": 210
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.3197619915008545,
      "learning_rate": 8.009075115760243e-05,
      "loss": 0.0478,
      "step": 220
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.43458399176597595,
      "learning_rate": 7.771801611446858e-05,
      "loss": 0.0349,
      "step": 230
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.3282864987850189,
      "learning_rate": 7.52514920787345e-05,
      "loss": 0.0309,
      "step": 240
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.24097760021686554,
      "learning_rate": 7.269952498697734e-05,
      "loss": 0.0209,
      "step": 250
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.31138163805007935,
      "learning_rate": 7.007074988802946e-05,
      "loss": 0.0414,
      "step": 260
    },
    {
      "epoch": 1.35,
      "grad_norm": 0.2666899859905243,
      "learning_rate": 6.737406172470657e-05,
      "loss": 0.0592,
      "step": 270
    },
    {
      "epoch": 1.4,
      "grad_norm": 0.513999879360199,
      "learning_rate": 6.461858523613684e-05,
      "loss": 0.0328,
      "step": 280
    },
    {
      "epoch": 1.45,
      "grad_norm": 0.2508058547973633,
      "learning_rate": 6.181364408253209e-05,
      "loss": 0.0256,
      "step": 290
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.22155922651290894,
      "learning_rate": 5.8968729296872874e-05,
      "loss": 0.0292,
      "step": 300
    },
    {
      "epoch": 1.5,
      "eval_loss": 0.04999532178044319,
      "eval_runtime": 263.0996,
      "eval_samples_per_second": 0.38,
      "eval_steps_per_second": 0.38,
      "step": 300
    },
    {
      "epoch": 1.55,
      "grad_norm": 0.5395333170890808,
      "learning_rate": 5.6093467170257374e-05,
      "loss": 0.0468,
      "step": 310
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.28358444571495056,
      "learning_rate": 5.319758667957928e-05,
      "loss": 0.0418,
      "step": 320
    },
    {
      "epoch": 1.65,
      "grad_norm": 0.18217401206493378,
      "learning_rate": 5.0290886567749696e-05,
      "loss": 0.0227,
      "step": 330
    },
    {
      "epoch": 1.7,
      "grad_norm": 0.4007870852947235,
      "learning_rate": 4.738320218785281e-05,
      "loss": 0.0383,
      "step": 340
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.20039933919906616,
      "learning_rate": 4.4484372223424415e-05,
      "loss": 0.0371,
      "step": 350
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.12333797663450241,
      "learning_rate": 4.160420539746115e-05,
      "loss": 0.0331,
      "step": 360
    },
    {
      "epoch": 1.85,
      "grad_norm": 0.3924310803413391,
      "learning_rate": 3.875244728280676e-05,
      "loss": 0.0396,
      "step": 370
    },
    {
      "epoch": 1.9,
      "grad_norm": 0.07961348444223404,
      "learning_rate": 3.593874732621847e-05,
      "loss": 0.033,
      "step": 380
    },
    {
      "epoch": 1.95,
      "grad_norm": 0.5689805746078491,
      "learning_rate": 3.317262619769368e-05,
      "loss": 0.0336,
      "step": 390
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.0888177827000618,
      "learning_rate": 3.046344357553632e-05,
      "loss": 0.028,
      "step": 400
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.04492698982357979,
      "eval_runtime": 263.4857,
      "eval_samples_per_second": 0.38,
      "eval_steps_per_second": 0.38,
      "step": 400
    },
    {
      "epoch": 2.05,
      "grad_norm": 0.4522015452384949,
      "learning_rate": 2.7820366476168224e-05,
      "loss": 0.0177,
      "step": 410
    },
    {
      "epoch": 2.1,
      "grad_norm": 0.037934061139822006,
      "learning_rate": 2.52523382358473e-05,
      "loss": 0.0143,
      "step": 420
    },
    {
      "epoch": 2.15,
      "grad_norm": 0.19100028276443481,
      "learning_rate": 2.2768048249248648e-05,
      "loss": 0.013,
      "step": 430
    },
    {
      "epoch": 2.2,
      "grad_norm": 0.2139115184545517,
      "learning_rate": 2.0375902567303472e-05,
      "loss": 0.0105,
      "step": 440
    },
    {
      "epoch": 2.25,
      "grad_norm": 0.31156909465789795,
      "learning_rate": 1.80839954537836e-05,
      "loss": 0.0136,
      "step": 450
    },
    {
      "epoch": 2.3,
      "grad_norm": 0.08543579280376434,
      "learning_rate": 1.5900081996875083e-05,
      "loss": 0.0196,
      "step": 460
    },
    {
      "epoch": 2.35,
      "grad_norm": 0.10310215502977371,
      "learning_rate": 1.3831551868414599e-05,
      "loss": 0.017,
      "step": 470
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.2742729187011719,
      "learning_rate": 1.1885404319579108e-05,
      "loss": 0.0254,
      "step": 480
    },
    {
      "epoch": 2.45,
      "grad_norm": 0.09017336368560791,
      "learning_rate": 1.006822449763537e-05,
      "loss": 0.0174,
      "step": 490
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.07442392408847809,
      "learning_rate": 8.38616116388612e-06,
      "loss": 0.013,
      "step": 500
    },
    {
      "epoch": 2.5,
      "eval_loss": 0.048839278519153595,
      "eval_runtime": 284.8522,
      "eval_samples_per_second": 0.351,
      "eval_steps_per_second": 0.351,
      "step": 500
    },
    {
      "epoch": 2.55,
      "grad_norm": 0.29669731855392456,
      "learning_rate": 6.844905888208181e-06,
      "loss": 0.015,
      "step": 510
    },
    {
      "epoch": 2.6,
      "grad_norm": 0.3099665939807892,
      "learning_rate": 5.449673790581611e-06,
      "loss": 0.0148,
      "step": 520
    },
    {
      "epoch": 2.65,
      "grad_norm": 0.20635627210140228,
      "learning_rate": 4.205185894774455e-06,
      "loss": 0.0156,
      "step": 530
    },
    {
      "epoch": 2.7,
      "grad_norm": 0.2796818017959595,
      "learning_rate": 3.115653153892761e-06,
      "loss": 0.015,
      "step": 540
    },
    {
      "epoch": 2.75,
      "grad_norm": 0.15870630741119385,
      "learning_rate": 2.1847622018482283e-06,
      "loss": 0.0144,
      "step": 550
    },
    {
      "epoch": 2.8,
      "grad_norm": 0.23704785108566284,
      "learning_rate": 1.4156628789559922e-06,
      "loss": 0.0241,
      "step": 560
    },
    {
      "epoch": 2.85,
      "grad_norm": 0.025902193039655685,
      "learning_rate": 8.10957573872062e-07,
      "loss": 0.0096,
      "step": 570
    },
    {
      "epoch": 2.9,
      "grad_norm": 0.04191463440656662,
      "learning_rate": 3.7269241793390085e-07,
      "loss": 0.0086,
      "step": 580
    },
    {
      "epoch": 2.95,
      "grad_norm": 0.04461716488003731,
      "learning_rate": 1.0235036169963242e-07,
      "loss": 0.0115,
      "step": 590
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.21583612263202667,
      "learning_rate": 8.461571127882373e-10,
      "loss": 0.0116,
      "step": 600
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.04811210185289383,
      "eval_runtime": 284.4632,
      "eval_samples_per_second": 0.352,
      "eval_steps_per_second": 0.352,
      "step": 600
    },
    {
      "epoch": 3.0,
      "step": 600,
      "total_flos": 3.3505112222539776e+16,
      "train_loss": 0.005030520980556806,
      "train_runtime": 5996.0343,
      "train_samples_per_second": 0.4,
      "train_steps_per_second": 0.1
    }
  ],
  "logging_steps": 10,
  "max_steps": 600,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.3505112222539776e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}