{
  "best_metric": 1.1285432577133179,
  "best_model_checkpoint": "./ryan_model/checkpoint-100",
  "epoch": 4.0,
  "eval_steps": 100,
  "global_step": 152,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.26,
      "grad_norm": 1.8508069515228271,
      "learning_rate": 0.00018684210526315792,
      "loss": 1.7256,
      "step": 10
    },
    {
      "epoch": 0.53,
      "grad_norm": 2.4252703189849854,
      "learning_rate": 0.0001736842105263158,
      "loss": 1.5727,
      "step": 20
    },
    {
      "epoch": 0.79,
      "grad_norm": 2.249209403991699,
      "learning_rate": 0.0001605263157894737,
      "loss": 1.4208,
      "step": 30
    },
    {
      "epoch": 1.05,
      "grad_norm": 2.09879207611084,
      "learning_rate": 0.0001486842105263158,
      "loss": 1.2251,
      "step": 40
    },
    {
      "epoch": 1.32,
      "grad_norm": 1.9784414768218994,
      "learning_rate": 0.0001355263157894737,
      "loss": 0.9854,
      "step": 50
    },
    {
      "epoch": 1.58,
      "grad_norm": 1.5055615901947021,
      "learning_rate": 0.00012236842105263157,
      "loss": 0.9456,
      "step": 60
    },
    {
      "epoch": 1.84,
      "grad_norm": 1.551707148551941,
      "learning_rate": 0.00010921052631578947,
      "loss": 0.8805,
      "step": 70
    },
    {
      "epoch": 2.11,
      "grad_norm": 1.3849328756332397,
      "learning_rate": 9.605263157894737e-05,
      "loss": 0.7414,
      "step": 80
    },
    {
      "epoch": 2.37,
      "grad_norm": 1.9976824522018433,
      "learning_rate": 8.289473684210527e-05,
      "loss": 0.5372,
      "step": 90
    },
    {
      "epoch": 2.63,
      "grad_norm": 2.4758734703063965,
      "learning_rate": 6.973684210526315e-05,
      "loss": 0.4821,
      "step": 100
    },
    {
      "epoch": 2.63,
      "eval_accuracy": 0.5583333333333333,
      "eval_loss": 1.1285432577133179,
      "eval_runtime": 60.0048,
      "eval_samples_per_second": 9.999,
      "eval_steps_per_second": 1.25,
      "step": 100
    },
    {
      "epoch": 2.89,
      "grad_norm": 1.7585190534591675,
      "learning_rate": 5.6578947368421056e-05,
      "loss": 0.4428,
      "step": 110
    },
    {
      "epoch": 3.16,
      "grad_norm": 0.9120551943778992,
      "learning_rate": 4.342105263157895e-05,
      "loss": 0.3698,
      "step": 120
    },
    {
      "epoch": 3.42,
      "grad_norm": 0.7287651896476746,
      "learning_rate": 3.0263157894736844e-05,
      "loss": 0.2302,
      "step": 130
    },
    {
      "epoch": 3.68,
      "grad_norm": 1.8238482475280762,
      "learning_rate": 1.7105263157894737e-05,
      "loss": 0.2082,
      "step": 140
    },
    {
      "epoch": 3.95,
      "grad_norm": 0.6237545609474182,
      "learning_rate": 3.9473684210526315e-06,
      "loss": 0.2266,
      "step": 150
    },
    {
      "epoch": 4.0,
      "step": 152,
      "total_flos": 1.85987442622464e+17,
      "train_loss": 0.7916242197940224,
      "train_runtime": 235.103,
      "train_samples_per_second": 10.208,
      "train_steps_per_second": 0.647
    }
  ],
  "logging_steps": 10,
  "max_steps": 152,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 100,
  "total_flos": 1.85987442622464e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}