| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.1569197239375226, | |
| "eval_steps": 100, | |
| "global_step": 200, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02905920813657828, | |
| "grad_norm": 1.8251163959503174, | |
| "learning_rate": 1.2903225806451613e-05, | |
| "loss": 2.6703, | |
| "num_input_tokens_seen": 686720, | |
| "step": 5, | |
| "train_runtime": 395.6486, | |
| "train_tokens_per_second": 1735.682 | |
| }, | |
| { | |
| "epoch": 0.05811841627315656, | |
| "grad_norm": 1.6785106658935547, | |
| "learning_rate": 2.9032258064516133e-05, | |
| "loss": 2.6024, | |
| "num_input_tokens_seen": 1373472, | |
| "step": 10, | |
| "train_runtime": 791.2815, | |
| "train_tokens_per_second": 1735.757 | |
| }, | |
| { | |
| "epoch": 0.08717762440973484, | |
| "grad_norm": 1.5060399770736694, | |
| "learning_rate": 4.516129032258064e-05, | |
| "loss": 2.2899, | |
| "num_input_tokens_seen": 2061472, | |
| "step": 15, | |
| "train_runtime": 1188.0329, | |
| "train_tokens_per_second": 1735.198 | |
| }, | |
| { | |
| "epoch": 0.11623683254631312, | |
| "grad_norm": 0.8076892495155334, | |
| "learning_rate": 6.129032258064517e-05, | |
| "loss": 1.7635, | |
| "num_input_tokens_seen": 2748800, | |
| "step": 20, | |
| "train_runtime": 1583.7612, | |
| "train_tokens_per_second": 1735.615 | |
| }, | |
| { | |
| "epoch": 0.1452960406828914, | |
| "grad_norm": 0.5285284519195557, | |
| "learning_rate": 7.741935483870968e-05, | |
| "loss": 1.4089, | |
| "num_input_tokens_seen": 3436480, | |
| "step": 25, | |
| "train_runtime": 1980.3723, | |
| "train_tokens_per_second": 1735.27 | |
| }, | |
| { | |
| "epoch": 0.17435524881946968, | |
| "grad_norm": 0.4370571970939636, | |
| "learning_rate": 9.35483870967742e-05, | |
| "loss": 1.2673, | |
| "num_input_tokens_seen": 4122560, | |
| "step": 30, | |
| "train_runtime": 2375.5923, | |
| "train_tokens_per_second": 1735.382 | |
| }, | |
| { | |
| "epoch": 0.20341445695604796, | |
| "grad_norm": 0.3576990067958832, | |
| "learning_rate": 9.99906754234138e-05, | |
| "loss": 1.1446, | |
| "num_input_tokens_seen": 4810240, | |
| "step": 35, | |
| "train_runtime": 2771.8387, | |
| "train_tokens_per_second": 1735.397 | |
| }, | |
| { | |
| "epoch": 0.23247366509262624, | |
| "grad_norm": 0.2466888129711151, | |
| "learning_rate": 9.993370449424153e-05, | |
| "loss": 1.0947, | |
| "num_input_tokens_seen": 5498080, | |
| "step": 40, | |
| "train_runtime": 3168.1072, | |
| "train_tokens_per_second": 1735.446 | |
| }, | |
| { | |
| "epoch": 0.2615328732292045, | |
| "grad_norm": 0.23221422731876373, | |
| "learning_rate": 9.982500190692845e-05, | |
| "loss": 1.0456, | |
| "num_input_tokens_seen": 6187968, | |
| "step": 45, | |
| "train_runtime": 3566.2467, | |
| "train_tokens_per_second": 1735.149 | |
| }, | |
| { | |
| "epoch": 0.2905920813657828, | |
| "grad_norm": 0.26600706577301025, | |
| "learning_rate": 9.966468027809582e-05, | |
| "loss": 1.0029, | |
| "num_input_tokens_seen": 6875040, | |
| "step": 50, | |
| "train_runtime": 3962.2914, | |
| "train_tokens_per_second": 1735.117 | |
| }, | |
| { | |
| "epoch": 0.31965128950236105, | |
| "grad_norm": 0.24320659041404724, | |
| "learning_rate": 9.945290570204359e-05, | |
| "loss": 0.974, | |
| "num_input_tokens_seen": 7561952, | |
| "step": 55, | |
| "train_runtime": 4357.8907, | |
| "train_tokens_per_second": 1735.232 | |
| }, | |
| { | |
| "epoch": 0.34871049763893935, | |
| "grad_norm": 0.22472068667411804, | |
| "learning_rate": 9.918989757867583e-05, | |
| "loss": 0.944, | |
| "num_input_tokens_seen": 8248800, | |
| "step": 60, | |
| "train_runtime": 4753.5334, | |
| "train_tokens_per_second": 1735.299 | |
| }, | |
| { | |
| "epoch": 0.3777697057755176, | |
| "grad_norm": 0.268387109041214, | |
| "learning_rate": 9.88759283862006e-05, | |
| "loss": 0.9328, | |
| "num_input_tokens_seen": 8937280, | |
| "step": 65, | |
| "train_runtime": 5150.3485, | |
| "train_tokens_per_second": 1735.277 | |
| }, | |
| { | |
| "epoch": 0.4068289139120959, | |
| "grad_norm": 0.21440809965133667, | |
| "learning_rate": 9.851132339884096e-05, | |
| "loss": 0.9074, | |
| "num_input_tokens_seen": 9625248, | |
| "step": 70, | |
| "train_runtime": 5546.5992, | |
| "train_tokens_per_second": 1735.342 | |
| }, | |
| { | |
| "epoch": 0.43588812204867416, | |
| "grad_norm": 0.228573739528656, | |
| "learning_rate": 9.80964603498485e-05, | |
| "loss": 0.8937, | |
| "num_input_tokens_seen": 10312960, | |
| "step": 75, | |
| "train_runtime": 5943.0082, | |
| "train_tokens_per_second": 1735.31 | |
| }, | |
| { | |
| "epoch": 0.46494733018525247, | |
| "grad_norm": 0.21550454199314117, | |
| "learning_rate": 9.763176904016913e-05, | |
| "loss": 0.8789, | |
| "num_input_tokens_seen": 11001696, | |
| "step": 80, | |
| "train_runtime": 6340.0803, | |
| "train_tokens_per_second": 1735.261 | |
| }, | |
| { | |
| "epoch": 0.4940065383218307, | |
| "grad_norm": 0.23164565861225128, | |
| "learning_rate": 9.711773089316645e-05, | |
| "loss": 0.8684, | |
| "num_input_tokens_seen": 11688192, | |
| "step": 85, | |
| "train_runtime": 6735.2127, | |
| "train_tokens_per_second": 1735.386 | |
| }, | |
| { | |
| "epoch": 0.523065746458409, | |
| "grad_norm": 0.2465435415506363, | |
| "learning_rate": 9.655487845586377e-05, | |
| "loss": 0.8422, | |
| "num_input_tokens_seen": 12375296, | |
| "step": 90, | |
| "train_runtime": 7131.144, | |
| "train_tokens_per_second": 1735.387 | |
| }, | |
| { | |
| "epoch": 0.5521249545949873, | |
| "grad_norm": 0.24671192467212677, | |
| "learning_rate": 9.594379484722184e-05, | |
| "loss": 0.8408, | |
| "num_input_tokens_seen": 13063552, | |
| "step": 95, | |
| "train_runtime": 7528.0327, | |
| "train_tokens_per_second": 1735.321 | |
| }, | |
| { | |
| "epoch": 0.5811841627315656, | |
| "grad_norm": 0.2649816572666168, | |
| "learning_rate": 9.528511315402358e-05, | |
| "loss": 0.8422, | |
| "num_input_tokens_seen": 13751648, | |
| "step": 100, | |
| "train_runtime": 7924.8612, | |
| "train_tokens_per_second": 1735.254 | |
| }, | |
| { | |
| "epoch": 0.5811841627315656, | |
| "eval_loss": 0.8274134397506714, | |
| "eval_runtime": 872.0056, | |
| "eval_samples_per_second": 6.314, | |
| "eval_steps_per_second": 1.579, | |
| "num_input_tokens_seen": 13751648, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.6102433708681438, | |
| "grad_norm": 0.2663179636001587, | |
| "learning_rate": 9.457951577499187e-05, | |
| "loss": 0.8217, | |
| "num_input_tokens_seen": 14438496, | |
| "step": 105, | |
| "train_runtime": 9194.7951, | |
| "train_tokens_per_second": 1570.29 | |
| }, | |
| { | |
| "epoch": 0.6393025790047221, | |
| "grad_norm": 0.2964800000190735, | |
| "learning_rate": 9.382773371381985e-05, | |
| "loss": 0.8018, | |
| "num_input_tokens_seen": 15126496, | |
| "step": 110, | |
| "train_runtime": 9591.6416, | |
| "train_tokens_per_second": 1577.05 | |
| }, | |
| { | |
| "epoch": 0.6683617871413003, | |
| "grad_norm": 0.28969624638557434, | |
| "learning_rate": 9.303054582184609e-05, | |
| "loss": 0.8072, | |
| "num_input_tokens_seen": 15815136, | |
| "step": 115, | |
| "train_runtime": 9989.0582, | |
| "train_tokens_per_second": 1583.246 | |
| }, | |
| { | |
| "epoch": 0.6974209952778787, | |
| "grad_norm": 0.30194368958473206, | |
| "learning_rate": 9.218877799115928e-05, | |
| "loss": 0.8014, | |
| "num_input_tokens_seen": 16503360, | |
| "step": 120, | |
| "train_runtime": 10386.1584, | |
| "train_tokens_per_second": 1588.976 | |
| }, | |
| { | |
| "epoch": 0.726480203414457, | |
| "grad_norm": 0.2715190052986145, | |
| "learning_rate": 9.130330229896847e-05, | |
| "loss": 0.7902, | |
| "num_input_tokens_seen": 17190176, | |
| "step": 125, | |
| "train_runtime": 10782.1528, | |
| "train_tokens_per_second": 1594.318 | |
| }, | |
| { | |
| "epoch": 0.7555394115510352, | |
| "grad_norm": 0.2829165756702423, | |
| "learning_rate": 9.037503610412501e-05, | |
| "loss": 0.7874, | |
| "num_input_tokens_seen": 17877120, | |
| "step": 130, | |
| "train_runtime": 11178.1048, | |
| "train_tokens_per_second": 1599.298 | |
| }, | |
| { | |
| "epoch": 0.7845986196876135, | |
| "grad_norm": 0.3267139196395874, | |
| "learning_rate": 8.940494109673265e-05, | |
| "loss": 0.7963, | |
| "num_input_tokens_seen": 18563488, | |
| "step": 135, | |
| "train_runtime": 11573.6201, | |
| "train_tokens_per_second": 1603.948 | |
| }, | |
| { | |
| "epoch": 0.8136578278241918, | |
| "grad_norm": 0.31520357728004456, | |
| "learning_rate": 8.839402230183e-05, | |
| "loss": 0.7822, | |
| "num_input_tokens_seen": 19253216, | |
| "step": 140, | |
| "train_runtime": 11971.6869, | |
| "train_tokens_per_second": 1608.229 | |
| }, | |
| { | |
| "epoch": 0.8427170359607701, | |
| "grad_norm": 0.30459001660346985, | |
| "learning_rate": 8.734332703817771e-05, | |
| "loss": 0.7859, | |
| "num_input_tokens_seen": 19941568, | |
| "step": 145, | |
| "train_runtime": 12368.4401, | |
| "train_tokens_per_second": 1612.294 | |
| }, | |
| { | |
| "epoch": 0.8717762440973483, | |
| "grad_norm": 0.32623955607414246, | |
| "learning_rate": 8.625394383322914e-05, | |
| "loss": 0.7687, | |
| "num_input_tokens_seen": 20629312, | |
| "step": 150, | |
| "train_runtime": 12764.6653, | |
| "train_tokens_per_second": 1616.126 | |
| }, | |
| { | |
| "epoch": 0.9008354522339266, | |
| "grad_norm": 0.32089152932167053, | |
| "learning_rate": 8.512700129540847e-05, | |
| "loss": 0.7672, | |
| "num_input_tokens_seen": 21315136, | |
| "step": 155, | |
| "train_runtime": 13160.0163, | |
| "train_tokens_per_second": 1619.689 | |
| }, | |
| { | |
| "epoch": 0.9298946603705049, | |
| "grad_norm": 0.3055724799633026, | |
| "learning_rate": 8.396366694486466e-05, | |
| "loss": 0.7639, | |
| "num_input_tokens_seen": 22002976, | |
| "step": 160, | |
| "train_runtime": 13557.0617, | |
| "train_tokens_per_second": 1622.99 | |
| }, | |
| { | |
| "epoch": 0.9589538685070832, | |
| "grad_norm": 0.30428361892700195, | |
| "learning_rate": 8.276514600391272e-05, | |
| "loss": 0.7617, | |
| "num_input_tokens_seen": 22690560, | |
| "step": 165, | |
| "train_runtime": 13953.7665, | |
| "train_tokens_per_second": 1626.124 | |
| }, | |
| { | |
| "epoch": 0.9880130766436614, | |
| "grad_norm": 0.3108614981174469, | |
| "learning_rate": 8.153268014841506e-05, | |
| "loss": 0.7613, | |
| "num_input_tokens_seen": 23378048, | |
| "step": 170, | |
| "train_runtime": 14350.762, | |
| "train_tokens_per_second": 1629.046 | |
| }, | |
| { | |
| "epoch": 1.0116236832546313, | |
| "grad_norm": 0.3532414436340332, | |
| "learning_rate": 8.026754622139691e-05, | |
| "loss": 0.7645, | |
| "num_input_tokens_seen": 23937248, | |
| "step": 175, | |
| "train_runtime": 14673.5871, | |
| "train_tokens_per_second": 1631.315 | |
| }, | |
| { | |
| "epoch": 1.0406828913912096, | |
| "grad_norm": 0.32768887281417847, | |
| "learning_rate": 7.897105491022818e-05, | |
| "loss": 0.7563, | |
| "num_input_tokens_seen": 24623744, | |
| "step": 180, | |
| "train_runtime": 15069.3557, | |
| "train_tokens_per_second": 1634.028 | |
| }, | |
| { | |
| "epoch": 1.069742099527788, | |
| "grad_norm": 0.310390830039978, | |
| "learning_rate": 7.764454938874252e-05, | |
| "loss": 0.7389, | |
| "num_input_tokens_seen": 25312576, | |
| "step": 185, | |
| "train_runtime": 15466.8535, | |
| "train_tokens_per_second": 1636.569 | |
| }, | |
| { | |
| "epoch": 1.0988013076643661, | |
| "grad_norm": 0.3332880139350891, | |
| "learning_rate": 7.628940392569994e-05, | |
| "loss": 0.7544, | |
| "num_input_tokens_seen": 25999584, | |
| "step": 190, | |
| "train_runtime": 15863.0121, | |
| "train_tokens_per_second": 1639.007 | |
| }, | |
| { | |
| "epoch": 1.1278605158009445, | |
| "grad_norm": 0.3539334237575531, | |
| "learning_rate": 7.490702246103513e-05, | |
| "loss": 0.7455, | |
| "num_input_tokens_seen": 26685632, | |
| "step": 195, | |
| "train_runtime": 16258.3382, | |
| "train_tokens_per_second": 1641.351 | |
| }, | |
| { | |
| "epoch": 1.1569197239375226, | |
| "grad_norm": 0.3256574869155884, | |
| "learning_rate": 7.3498837151366e-05, | |
| "loss": 0.7465, | |
| "num_input_tokens_seen": 27371456, | |
| "step": 200, | |
| "train_runtime": 16653.8315, | |
| "train_tokens_per_second": 1643.553 | |
| }, | |
| { | |
| "epoch": 1.1569197239375226, | |
| "eval_loss": 0.7508572340011597, | |
| "eval_runtime": 873.0587, | |
| "eval_samples_per_second": 6.307, | |
| "eval_steps_per_second": 1.577, | |
| "num_input_tokens_seen": 27371456, | |
| "step": 200 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 519, | |
| "num_input_tokens_seen": 27371456, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2501177571560653e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |