{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.992108229988726,
"eval_steps": 500,
"global_step": 110,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009019165727170236,
"grad_norm": 0.8173778653144836,
"learning_rate": 4.9989804820704735e-05,
"loss": 0.8968,
"num_input_tokens_seen": 2097152,
"step": 1
},
{
"epoch": 0.018038331454340473,
"grad_norm": 0.5661506056785583,
"learning_rate": 4.995922759815339e-05,
"loss": 0.8202,
"num_input_tokens_seen": 4194304,
"step": 2
},
{
"epoch": 0.02705749718151071,
"grad_norm": 0.4142468571662903,
"learning_rate": 4.9908293271567286e-05,
"loss": 0.7783,
"num_input_tokens_seen": 6291456,
"step": 3
},
{
"epoch": 0.036076662908680945,
"grad_norm": 0.2738800048828125,
"learning_rate": 4.9837043383713753e-05,
"loss": 0.7642,
"num_input_tokens_seen": 8388608,
"step": 4
},
{
"epoch": 0.04509582863585118,
"grad_norm": 0.2011900395154953,
"learning_rate": 4.9745536047023324e-05,
"loss": 0.7475,
"num_input_tokens_seen": 10485760,
"step": 5
},
{
"epoch": 0.05411499436302142,
"grad_norm": 0.161564901471138,
"learning_rate": 4.963384589619233e-05,
"loss": 0.7319,
"num_input_tokens_seen": 12582912,
"step": 6
},
{
"epoch": 0.06313416009019165,
"grad_norm": 0.14298053085803986,
"learning_rate": 4.9502064027309836e-05,
"loss": 0.7125,
"num_input_tokens_seen": 14680064,
"step": 7
},
{
"epoch": 0.07215332581736189,
"grad_norm": 0.14268024265766144,
"learning_rate": 4.935029792355834e-05,
"loss": 0.7117,
"num_input_tokens_seen": 16777216,
"step": 8
},
{
"epoch": 0.08117249154453213,
"grad_norm": 0.1712426245212555,
"learning_rate": 4.917867136754893e-05,
"loss": 0.724,
"num_input_tokens_seen": 18874368,
"step": 9
},
{
"epoch": 0.09019165727170236,
"grad_norm": 0.15666206181049347,
"learning_rate": 4.898732434036244e-05,
"loss": 0.7144,
"num_input_tokens_seen": 20971520,
"step": 10
},
{
"epoch": 0.0992108229988726,
"grad_norm": 0.139165461063385,
"learning_rate": 4.877641290737884e-05,
"loss": 0.697,
"num_input_tokens_seen": 23068672,
"step": 11
},
{
"epoch": 0.10822998872604284,
"grad_norm": 0.15043741464614868,
"learning_rate": 4.854610909098812e-05,
"loss": 0.7117,
"num_input_tokens_seen": 25165824,
"step": 12
},
{
"epoch": 0.11724915445321307,
"grad_norm": 0.13478276133537292,
"learning_rate": 4.829660073028631e-05,
"loss": 0.6996,
"num_input_tokens_seen": 27262976,
"step": 13
},
{
"epoch": 0.1262683201803833,
"grad_norm": 0.12838861346244812,
"learning_rate": 4.802809132787125e-05,
"loss": 0.7172,
"num_input_tokens_seen": 29360128,
"step": 14
},
{
"epoch": 0.13528748590755355,
"grad_norm": 0.11803996562957764,
"learning_rate": 4.774079988386296e-05,
"loss": 0.725,
"num_input_tokens_seen": 31457280,
"step": 15
},
{
"epoch": 0.14430665163472378,
"grad_norm": 0.11841780692338943,
"learning_rate": 4.743496071728396e-05,
"loss": 0.7073,
"num_input_tokens_seen": 33554432,
"step": 16
},
{
"epoch": 0.15332581736189402,
"grad_norm": 0.11901742964982986,
"learning_rate": 4.711082327494536e-05,
"loss": 0.7021,
"num_input_tokens_seen": 35651584,
"step": 17
},
{
"epoch": 0.16234498308906425,
"grad_norm": 0.11239754408597946,
"learning_rate": 4.6768651927994434e-05,
"loss": 0.6894,
"num_input_tokens_seen": 37748736,
"step": 18
},
{
"epoch": 0.1713641488162345,
"grad_norm": 0.10254418104887009,
"learning_rate": 4.640872575628973e-05,
"loss": 0.7031,
"num_input_tokens_seen": 39845888,
"step": 19
},
{
"epoch": 0.18038331454340473,
"grad_norm": 0.09802790731191635,
"learning_rate": 4.6031338320779534e-05,
"loss": 0.6629,
"num_input_tokens_seen": 41943040,
"step": 20
},
{
"epoch": 0.18940248027057496,
"grad_norm": 0.0993066355586052,
"learning_rate": 4.563679742406935e-05,
"loss": 0.6774,
"num_input_tokens_seen": 44040192,
"step": 21
},
{
"epoch": 0.1984216459977452,
"grad_norm": 0.09300491958856583,
"learning_rate": 4.522542485937369e-05,
"loss": 0.7041,
"num_input_tokens_seen": 46137344,
"step": 22
},
{
"epoch": 0.20744081172491544,
"grad_norm": 0.08633296191692352,
"learning_rate": 4.479755614805688e-05,
"loss": 0.6894,
"num_input_tokens_seen": 48234496,
"step": 23
},
{
"epoch": 0.21645997745208567,
"grad_norm": 0.0841744914650917,
"learning_rate": 4.4353540265977064e-05,
"loss": 0.673,
"num_input_tokens_seen": 50331648,
"step": 24
},
{
"epoch": 0.2254791431792559,
"grad_norm": 0.09111865609884262,
"learning_rate": 4.389373935885646e-05,
"loss": 0.6691,
"num_input_tokens_seen": 52428800,
"step": 25
},
{
"epoch": 0.23449830890642615,
"grad_norm": 0.08103901892900467,
"learning_rate": 4.341852844691012e-05,
"loss": 0.6794,
"num_input_tokens_seen": 54525952,
"step": 26
},
{
"epoch": 0.24351747463359638,
"grad_norm": 0.08423160016536713,
"learning_rate": 4.292829511897409e-05,
"loss": 0.6946,
"num_input_tokens_seen": 56623104,
"step": 27
},
{
"epoch": 0.2525366403607666,
"grad_norm": 0.09218155592679977,
"learning_rate": 4.242343921638234e-05,
"loss": 0.6939,
"num_input_tokens_seen": 58720256,
"step": 28
},
{
"epoch": 0.2615558060879369,
"grad_norm": 0.07839576154947281,
"learning_rate": 4.1904372506850484e-05,
"loss": 0.6759,
"num_input_tokens_seen": 60817408,
"step": 29
},
{
"epoch": 0.2705749718151071,
"grad_norm": 0.08427103608846664,
"learning_rate": 4.137151834863213e-05,
"loss": 0.6856,
"num_input_tokens_seen": 62914560,
"step": 30
},
{
"epoch": 0.27959413754227735,
"grad_norm": 0.08769369125366211,
"learning_rate": 4.082531134522176e-05,
"loss": 0.6753,
"num_input_tokens_seen": 65011712,
"step": 31
},
{
"epoch": 0.28861330326944756,
"grad_norm": 0.09144359081983566,
"learning_rate": 4.0266196990885955e-05,
"loss": 0.6769,
"num_input_tokens_seen": 67108864,
"step": 32
},
{
"epoch": 0.2976324689966178,
"grad_norm": 0.08329298347234726,
"learning_rate": 3.969463130731183e-05,
"loss": 0.6664,
"num_input_tokens_seen": 69206016,
"step": 33
},
{
"epoch": 0.30665163472378804,
"grad_norm": 0.07759370654821396,
"learning_rate": 3.911108047166924e-05,
"loss": 0.6383,
"num_input_tokens_seen": 71303168,
"step": 34
},
{
"epoch": 0.3156708004509583,
"grad_norm": 0.07686188071966171,
"learning_rate": 3.851602043638994e-05,
"loss": 0.6726,
"num_input_tokens_seen": 73400320,
"step": 35
},
{
"epoch": 0.3246899661781285,
"grad_norm": 0.0853535607457161,
"learning_rate": 3.790993654097405e-05,
"loss": 0.6612,
"num_input_tokens_seen": 75497472,
"step": 36
},
{
"epoch": 0.3337091319052988,
"grad_norm": 0.08194194734096527,
"learning_rate": 3.72933231161401e-05,
"loss": 0.6763,
"num_input_tokens_seen": 77594624,
"step": 37
},
{
"epoch": 0.342728297632469,
"grad_norm": 0.0810617133975029,
"learning_rate": 3.6666683080641846e-05,
"loss": 0.6533,
"num_input_tokens_seen": 79691776,
"step": 38
},
{
"epoch": 0.35174746335963925,
"grad_norm": 0.08145678043365479,
"learning_rate": 3.603052753108053e-05,
"loss": 0.6598,
"num_input_tokens_seen": 81788928,
"step": 39
},
{
"epoch": 0.36076662908680945,
"grad_norm": 0.07560884952545166,
"learning_rate": 3.5385375325047166e-05,
"loss": 0.6801,
"num_input_tokens_seen": 83886080,
"step": 40
},
{
"epoch": 0.3697857948139797,
"grad_norm": 0.06700561195611954,
"learning_rate": 3.4731752657934794e-05,
"loss": 0.6668,
"num_input_tokens_seen": 85983232,
"step": 41
},
{
"epoch": 0.3788049605411499,
"grad_norm": 0.076134592294693,
"learning_rate": 3.4070192633766025e-05,
"loss": 0.6756,
"num_input_tokens_seen": 88080384,
"step": 42
},
{
"epoch": 0.3878241262683202,
"grad_norm": 0.06908991187810898,
"learning_rate": 3.3401234830385756e-05,
"loss": 0.6589,
"num_input_tokens_seen": 90177536,
"step": 43
},
{
"epoch": 0.3968432919954904,
"grad_norm": 0.07035559415817261,
"learning_rate": 3.272542485937369e-05,
"loss": 0.6435,
"num_input_tokens_seen": 92274688,
"step": 44
},
{
"epoch": 0.40586245772266066,
"grad_norm": 0.06883817166090012,
"learning_rate": 3.2043313921035743e-05,
"loss": 0.6602,
"num_input_tokens_seen": 94371840,
"step": 45
},
{
"epoch": 0.41488162344983087,
"grad_norm": 0.06745729595422745,
"learning_rate": 3.135545835483718e-05,
"loss": 0.648,
"num_input_tokens_seen": 96468992,
"step": 46
},
{
"epoch": 0.42390078917700114,
"grad_norm": 0.06895168870687485,
"learning_rate": 3.0662419185644115e-05,
"loss": 0.6493,
"num_input_tokens_seen": 98566144,
"step": 47
},
{
"epoch": 0.43291995490417134,
"grad_norm": 0.06701447814702988,
"learning_rate": 2.996476166614364e-05,
"loss": 0.6701,
"num_input_tokens_seen": 100663296,
"step": 48
},
{
"epoch": 0.4419391206313416,
"grad_norm": 0.07372091710567474,
"learning_rate": 2.92630548158156e-05,
"loss": 0.6674,
"num_input_tokens_seen": 102760448,
"step": 49
},
{
"epoch": 0.4509582863585118,
"grad_norm": 0.07131503522396088,
"learning_rate": 2.8557870956832132e-05,
"loss": 0.6635,
"num_input_tokens_seen": 104857600,
"step": 50
},
{
"epoch": 0.4599774520856821,
"grad_norm": 0.0702953040599823,
"learning_rate": 2.7849785247263515e-05,
"loss": 0.6598,
"num_input_tokens_seen": 106954752,
"step": 51
},
{
"epoch": 0.4689966178128523,
"grad_norm": 0.06502864509820938,
"learning_rate": 2.7139375211970996e-05,
"loss": 0.6732,
"num_input_tokens_seen": 109051904,
"step": 52
},
{
"epoch": 0.47801578354002255,
"grad_norm": 0.06253138929605484,
"learning_rate": 2.6427220271569203e-05,
"loss": 0.6546,
"num_input_tokens_seen": 111149056,
"step": 53
},
{
"epoch": 0.48703494926719276,
"grad_norm": 0.06724034994840622,
"learning_rate": 2.5713901269842404e-05,
"loss": 0.6705,
"num_input_tokens_seen": 113246208,
"step": 54
},
{
"epoch": 0.496054114994363,
"grad_norm": 0.06660095602273941,
"learning_rate": 2.5e-05,
"loss": 0.6463,
"num_input_tokens_seen": 115343360,
"step": 55
},
{
"epoch": 0.5050732807215332,
"grad_norm": 0.06747590005397797,
"learning_rate": 2.42860987301576e-05,
"loss": 0.6505,
"num_input_tokens_seen": 117440512,
"step": 56
},
{
"epoch": 0.5140924464487034,
"grad_norm": 0.07006718963384628,
"learning_rate": 2.35727797284308e-05,
"loss": 0.6524,
"num_input_tokens_seen": 119537664,
"step": 57
},
{
"epoch": 0.5231116121758738,
"grad_norm": 0.06662806868553162,
"learning_rate": 2.2860624788029013e-05,
"loss": 0.6559,
"num_input_tokens_seen": 121634816,
"step": 58
},
{
"epoch": 0.532130777903044,
"grad_norm": 0.06757567077875137,
"learning_rate": 2.2150214752736488e-05,
"loss": 0.6511,
"num_input_tokens_seen": 123731968,
"step": 59
},
{
"epoch": 0.5411499436302142,
"grad_norm": 0.07193508744239807,
"learning_rate": 2.1442129043167874e-05,
"loss": 0.6457,
"num_input_tokens_seen": 125829120,
"step": 60
},
{
"epoch": 0.5501691093573844,
"grad_norm": 0.06620261073112488,
"learning_rate": 2.0736945184184405e-05,
"loss": 0.6492,
"num_input_tokens_seen": 127926272,
"step": 61
},
{
"epoch": 0.5591882750845547,
"grad_norm": 0.06846100836992264,
"learning_rate": 2.003523833385637e-05,
"loss": 0.6539,
"num_input_tokens_seen": 130023424,
"step": 62
},
{
"epoch": 0.5682074408117249,
"grad_norm": 0.06885959208011627,
"learning_rate": 1.9337580814355888e-05,
"loss": 0.6417,
"num_input_tokens_seen": 132120576,
"step": 63
},
{
"epoch": 0.5772266065388951,
"grad_norm": 0.06715461611747742,
"learning_rate": 1.8644541645162834e-05,
"loss": 0.6663,
"num_input_tokens_seen": 134217728,
"step": 64
},
{
"epoch": 0.5862457722660653,
"grad_norm": 0.06593496352434158,
"learning_rate": 1.795668607896426e-05,
"loss": 0.6572,
"num_input_tokens_seen": 136314880,
"step": 65
},
{
"epoch": 0.5952649379932357,
"grad_norm": 0.06741371005773544,
"learning_rate": 1.7274575140626318e-05,
"loss": 0.6825,
"num_input_tokens_seen": 138412032,
"step": 66
},
{
"epoch": 0.6042841037204059,
"grad_norm": 0.06627509742975235,
"learning_rate": 1.6598765169614243e-05,
"loss": 0.6509,
"num_input_tokens_seen": 140509184,
"step": 67
},
{
"epoch": 0.6133032694475761,
"grad_norm": 0.06725791096687317,
"learning_rate": 1.5929807366233977e-05,
"loss": 0.6619,
"num_input_tokens_seen": 142606336,
"step": 68
},
{
"epoch": 0.6223224351747464,
"grad_norm": 0.06500604748725891,
"learning_rate": 1.5268247342065215e-05,
"loss": 0.6759,
"num_input_tokens_seen": 144703488,
"step": 69
},
{
"epoch": 0.6313416009019166,
"grad_norm": 0.07281672209501266,
"learning_rate": 1.4614624674952842e-05,
"loss": 0.6568,
"num_input_tokens_seen": 146800640,
"step": 70
},
{
"epoch": 0.6403607666290868,
"grad_norm": 0.0661671832203865,
"learning_rate": 1.3969472468919461e-05,
"loss": 0.6472,
"num_input_tokens_seen": 148897792,
"step": 71
},
{
"epoch": 0.649379932356257,
"grad_norm": 0.06678120791912079,
"learning_rate": 1.3333316919358157e-05,
"loss": 0.6473,
"num_input_tokens_seen": 150994944,
"step": 72
},
{
"epoch": 0.6583990980834273,
"grad_norm": 0.06216076388955116,
"learning_rate": 1.2706676883859903e-05,
"loss": 0.6485,
"num_input_tokens_seen": 153092096,
"step": 73
},
{
"epoch": 0.6674182638105975,
"grad_norm": 0.06877604126930237,
"learning_rate": 1.2090063459025955e-05,
"loss": 0.6544,
"num_input_tokens_seen": 155189248,
"step": 74
},
{
"epoch": 0.6764374295377678,
"grad_norm": 0.06808489561080933,
"learning_rate": 1.148397956361007e-05,
"loss": 0.6763,
"num_input_tokens_seen": 157286400,
"step": 75
},
{
"epoch": 0.685456595264938,
"grad_norm": 0.06282905489206314,
"learning_rate": 1.0888919528330777e-05,
"loss": 0.6406,
"num_input_tokens_seen": 159383552,
"step": 76
},
{
"epoch": 0.6944757609921083,
"grad_norm": 0.06371884793043137,
"learning_rate": 1.0305368692688174e-05,
"loss": 0.6502,
"num_input_tokens_seen": 161480704,
"step": 77
},
{
"epoch": 0.7034949267192785,
"grad_norm": 0.06734833121299744,
"learning_rate": 9.733803009114045e-06,
"loss": 0.6495,
"num_input_tokens_seen": 163577856,
"step": 78
},
{
"epoch": 0.7125140924464487,
"grad_norm": 0.06442791223526001,
"learning_rate": 9.174688654778243e-06,
"loss": 0.6469,
"num_input_tokens_seen": 165675008,
"step": 79
},
{
"epoch": 0.7215332581736189,
"grad_norm": 0.06670290976762772,
"learning_rate": 8.628481651367876e-06,
"loss": 0.6642,
"num_input_tokens_seen": 167772160,
"step": 80
},
{
"epoch": 0.7305524239007892,
"grad_norm": 0.06524420529603958,
"learning_rate": 8.09562749314952e-06,
"loss": 0.6598,
"num_input_tokens_seen": 169869312,
"step": 81
},
{
"epoch": 0.7395715896279594,
"grad_norm": 0.06147584691643715,
"learning_rate": 7.576560783617668e-06,
"loss": 0.6461,
"num_input_tokens_seen": 171966464,
"step": 82
},
{
"epoch": 0.7485907553551296,
"grad_norm": 0.06680367887020111,
"learning_rate": 7.071704881025915e-06,
"loss": 0.6706,
"num_input_tokens_seen": 174063616,
"step": 83
},
{
"epoch": 0.7576099210822999,
"grad_norm": 0.0657731294631958,
"learning_rate": 6.5814715530898745e-06,
"loss": 0.6768,
"num_input_tokens_seen": 176160768,
"step": 84
},
{
"epoch": 0.7666290868094702,
"grad_norm": 0.06921912729740143,
"learning_rate": 6.106260641143546e-06,
"loss": 0.6446,
"num_input_tokens_seen": 178257920,
"step": 85
},
{
"epoch": 0.7756482525366404,
"grad_norm": 0.06477085500955582,
"learning_rate": 5.646459734022938e-06,
"loss": 0.6393,
"num_input_tokens_seen": 180355072,
"step": 86
},
{
"epoch": 0.7846674182638106,
"grad_norm": 0.06648603081703186,
"learning_rate": 5.202443851943126e-06,
"loss": 0.6585,
"num_input_tokens_seen": 182452224,
"step": 87
},
{
"epoch": 0.7936865839909808,
"grad_norm": 0.06899042427539825,
"learning_rate": 4.7745751406263165e-06,
"loss": 0.6393,
"num_input_tokens_seen": 184549376,
"step": 88
},
{
"epoch": 0.8027057497181511,
"grad_norm": 0.06467007100582123,
"learning_rate": 4.36320257593065e-06,
"loss": 0.6613,
"num_input_tokens_seen": 186646528,
"step": 89
},
{
"epoch": 0.8117249154453213,
"grad_norm": 0.06616765260696411,
"learning_rate": 3.968661679220468e-06,
"loss": 0.6631,
"num_input_tokens_seen": 188743680,
"step": 90
},
{
"epoch": 0.8207440811724915,
"grad_norm": 0.06439518183469772,
"learning_rate": 3.591274243710277e-06,
"loss": 0.6356,
"num_input_tokens_seen": 190840832,
"step": 91
},
{
"epoch": 0.8297632468996617,
"grad_norm": 0.06294091790914536,
"learning_rate": 3.2313480720055745e-06,
"loss": 0.6475,
"num_input_tokens_seen": 192937984,
"step": 92
},
{
"epoch": 0.8387824126268321,
"grad_norm": 0.06395729631185532,
"learning_rate": 2.889176725054643e-06,
"loss": 0.6387,
"num_input_tokens_seen": 195035136,
"step": 93
},
{
"epoch": 0.8478015783540023,
"grad_norm": 0.06324164569377899,
"learning_rate": 2.565039282716045e-06,
"loss": 0.6533,
"num_input_tokens_seen": 197132288,
"step": 94
},
{
"epoch": 0.8568207440811725,
"grad_norm": 0.06285514682531357,
"learning_rate": 2.2592001161370392e-06,
"loss": 0.6606,
"num_input_tokens_seen": 199229440,
"step": 95
},
{
"epoch": 0.8658399098083427,
"grad_norm": 0.06468215584754944,
"learning_rate": 1.97190867212875e-06,
"loss": 0.6667,
"num_input_tokens_seen": 201326592,
"step": 96
},
{
"epoch": 0.874859075535513,
"grad_norm": 0.06431297212839127,
"learning_rate": 1.703399269713693e-06,
"loss": 0.6599,
"num_input_tokens_seen": 203423744,
"step": 97
},
{
"epoch": 0.8838782412626832,
"grad_norm": 0.06363896280527115,
"learning_rate": 1.4538909090118846e-06,
"loss": 0.6499,
"num_input_tokens_seen": 205520896,
"step": 98
},
{
"epoch": 0.8928974069898534,
"grad_norm": 0.06607792526483536,
"learning_rate": 1.2235870926211619e-06,
"loss": 0.649,
"num_input_tokens_seen": 207618048,
"step": 99
},
{
"epoch": 0.9019165727170236,
"grad_norm": 0.06387382745742798,
"learning_rate": 1.0126756596375686e-06,
"loss": 0.6551,
"num_input_tokens_seen": 209715200,
"step": 100
},
{
"epoch": 0.910935738444194,
"grad_norm": 0.07034407556056976,
"learning_rate": 8.213286324510738e-07,
"loss": 0.668,
"num_input_tokens_seen": 211812352,
"step": 101
},
{
"epoch": 0.9199549041713642,
"grad_norm": 0.07087666541337967,
"learning_rate": 6.497020764416633e-07,
"loss": 0.6505,
"num_input_tokens_seen": 213909504,
"step": 102
},
{
"epoch": 0.9289740698985344,
"grad_norm": 0.07003826647996902,
"learning_rate": 4.979359726901639e-07,
"loss": 0.6504,
"num_input_tokens_seen": 216006656,
"step": 103
},
{
"epoch": 0.9379932356257046,
"grad_norm": 0.06608197838068008,
"learning_rate": 3.6615410380767544e-07,
"loss": 0.6504,
"num_input_tokens_seen": 218103808,
"step": 104
},
{
"epoch": 0.9470124013528749,
"grad_norm": 0.0653860792517662,
"learning_rate": 2.544639529766829e-07,
"loss": 0.6491,
"num_input_tokens_seen": 220200960,
"step": 105
},
{
"epoch": 0.9560315670800451,
"grad_norm": 0.06089319288730621,
"learning_rate": 1.6295661628624447e-07,
"loss": 0.6449,
"num_input_tokens_seen": 222298112,
"step": 106
},
{
"epoch": 0.9650507328072153,
"grad_norm": 0.06895752251148224,
"learning_rate": 9.170672843271666e-08,
"loss": 0.6729,
"num_input_tokens_seen": 224395264,
"step": 107
},
{
"epoch": 0.9740698985343855,
"grad_norm": 0.06209622696042061,
"learning_rate": 4.07724018466088e-08,
"loss": 0.6308,
"num_input_tokens_seen": 226492416,
"step": 108
},
{
"epoch": 0.9830890642615558,
"grad_norm": 0.06596114486455917,
"learning_rate": 1.0195179295269252e-08,
"loss": 0.6695,
"num_input_tokens_seen": 228589568,
"step": 109
},
{
"epoch": 0.992108229988726,
"grad_norm": 0.06238327547907829,
"learning_rate": 0.0,
"loss": 0.6467,
"num_input_tokens_seen": 230686720,
"step": 110
},
{
"epoch": 0.992108229988726,
"num_input_tokens_seen": 230686720,
"step": 110,
"total_flos": 9.786587384895242e+18,
"train_loss": 0.6728460962122137,
"train_runtime": 17352.9545,
"train_samples_per_second": 3.271,
"train_steps_per_second": 0.006
}
],
"logging_steps": 1,
"max_steps": 110,
"num_input_tokens_seen": 230686720,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.786587384895242e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}