diff --git a/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single.txt b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single.txt new file mode 100644 index 00000000..ca83bcd9 --- /dev/null +++ b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single.txt @@ -0,0 +1,375 @@ +08/21/2024 06:24:59 - INFO - llamafactory.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: torch.bfloat16 +08/21/2024 06:24:59 - INFO - llamafactory.data.template - Add pad token: +08/21/2024 06:24:59 - INFO - llamafactory.data.template - Cannot add this chat template to tokenizer. +08/21/2024 06:24:59 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_1M_CN... +training example: +input_ids: +[195, 31106, 10345, 31423, 17156, 4129, 4531, 6244, 26653, 10203, 73, 1732, 19811, 72, 31488, 2837, 14302, 3864, 73, 5, 7810, 3068, 17901, 31135, 31237, 31473, 31143, 966, 2898, 24240, 1737, 6664, 72, 6967, 31365, 2837, 20074, 3413, 31188, 31241, 31530, 72, 31187, 6255, 4011, 10047, 73, 31157, 5, 196, 23593, 31237, 31473, 6244, 26653, 10203, 73, 2] +inputs: + 判断给定的文章是否符合语法规则。如果不符合,请提供修改建议。 +下面是一篇文章的开头: "为了探讨这个主题,本文将提供一系列数据和实例,以证明这一观点。" + 这个开头符合语法规则。 +label_ids: +[2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 23593, 31237, 31473, 6244, 26653, 10203, 73, 2] +labels: + 这个开头符合语法规则。 +08/21/2024 06:26:18 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled. +08/21/2024 06:26:18 - INFO - llamafactory.model.model_utils.attention - Using vanilla attention implementation. +08/21/2024 06:26:18 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32. +08/21/2024 06:26:18 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA +08/21/2024 06:26:18 - INFO - llamafactory.model.model_utils.misc - Found linear modules: up_proj,gate_proj,o_proj,down_proj,W_pack +08/21/2024 06:26:18 - INFO - llamafactory.model.loader - trainable params: 17,891,328 || all params: 7,018,450,944 || trainable%: 0.2549 +{'loss': 1.5586, 'grad_norm': 0.3827691078186035, 'learning_rate': 3e-06, 'epoch': 0.01, 'num_input_tokens_seen': 9920} +{'loss': 1.6295, 'grad_norm': 0.3326924741268158, 'learning_rate': 6e-06, 'epoch': 0.01, 'num_input_tokens_seen': 19376} +{'loss': 1.7438, 'grad_norm': 0.36344507336616516, 'learning_rate': 9e-06, 'epoch': 0.02, 'num_input_tokens_seen': 29488} +{'loss': 1.8413, 'grad_norm': 0.4467258155345917, 'learning_rate': 1.2e-05, 'epoch': 0.02, 'num_input_tokens_seen': 38208} +{'loss': 1.5369, 'grad_norm': 0.40837302803993225, 'learning_rate': 1.5e-05, 'epoch': 0.03, 'num_input_tokens_seen': 44624} +{'loss': 1.77, 'grad_norm': 0.6898334622383118, 'learning_rate': 1.8e-05, 'epoch': 0.03, 'num_input_tokens_seen': 52416} +{'loss': 1.6273, 'grad_norm': 0.5511844158172607, 'learning_rate': 2.1e-05, 'epoch': 0.04, 'num_input_tokens_seen': 60368} +{'loss': 1.9737, 'grad_norm': 0.8902711272239685, 'learning_rate': 2.4e-05, 'epoch': 0.04, 'num_input_tokens_seen': 66784} +{'loss': 1.6169, 'grad_norm': 0.5174709558486938, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.05, 'num_input_tokens_seen': 73344} +{'loss': 1.8011, 'grad_norm': 0.6341367363929749, 'learning_rate': 3e-05, 'epoch': 0.05, 'num_input_tokens_seen': 82032} +{'loss': 1.773, 'grad_norm': 0.43879804015159607, 'learning_rate': 3.3e-05, 'epoch': 0.06, 'num_input_tokens_seen': 89808} +{'loss': 1.6426, 'grad_norm': 0.6926860213279724, 'learning_rate': 3.6e-05, 'epoch': 0.06, 'num_input_tokens_seen': 96080} +{'loss': 1.548, 'grad_norm': 0.8264650106430054, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.07, 'num_input_tokens_seen': 102784} +{'loss': 1.5749, 'grad_norm': 0.47357088327407837, 'learning_rate': 4.2e-05, 'epoch': 0.07, 'num_input_tokens_seen': 111184} +{'loss': 1.7287, 'grad_norm': 0.5448750853538513, 'learning_rate': 4.5e-05, 'epoch': 0.08, 'num_input_tokens_seen': 116784} +{'loss': 1.4529, 'grad_norm': 0.6237770318984985, 'learning_rate': 4.8e-05, 'epoch': 0.09, 'num_input_tokens_seen': 125472} +{'loss': 1.6277, 'grad_norm': 0.68182373046875, 'learning_rate': 5.1000000000000006e-05, 'epoch': 0.09, 'num_input_tokens_seen': 133360} +{'loss': 1.6691, 'grad_norm': 0.7576949000358582, 'learning_rate': 5.4000000000000005e-05, 'epoch': 0.1, 'num_input_tokens_seen': 140336} +{'loss': 1.3956, 'grad_norm': 0.7188912630081177, 'learning_rate': 5.6999999999999996e-05, 'epoch': 0.1, 'num_input_tokens_seen': 148112} +{'loss': 1.545, 'grad_norm': 0.6228360533714294, 'learning_rate': 6e-05, 'epoch': 0.11, 'num_input_tokens_seen': 156480} +{'loss': 1.5088, 'grad_norm': 0.6807077527046204, 'learning_rate': 6.3e-05, 'epoch': 0.11, 'num_input_tokens_seen': 164048} +{'loss': 1.484, 'grad_norm': 0.9484089612960815, 'learning_rate': 6.6e-05, 'epoch': 0.12, 'num_input_tokens_seen': 172192} +{'loss': 1.651, 'grad_norm': 0.6590979695320129, 'learning_rate': 6.9e-05, 'epoch': 0.12, 'num_input_tokens_seen': 180528} +{'loss': 1.4958, 'grad_norm': 1.332999587059021, 'learning_rate': 7.2e-05, 'epoch': 0.13, 'num_input_tokens_seen': 187728} +{'loss': 1.499, 'grad_norm': 0.6886986494064331, 'learning_rate': 7.500000000000001e-05, 'epoch': 0.13, 'num_input_tokens_seen': 194592} +{'loss': 1.3138, 'grad_norm': 0.5862389206886292, 'learning_rate': 7.800000000000001e-05, 'epoch': 0.14, 'num_input_tokens_seen': 202416} +{'loss': 1.2772, 'grad_norm': 0.3789741098880768, 'learning_rate': 8.1e-05, 'epoch': 0.14, 'num_input_tokens_seen': 211328} +{'loss': 1.4917, 'grad_norm': 1.1701974868774414, 'learning_rate': 8.4e-05, 'epoch': 0.15, 'num_input_tokens_seen': 218544} +{'loss': 1.4778, 'grad_norm': 0.4418068528175354, 'learning_rate': 8.7e-05, 'epoch': 0.15, 'num_input_tokens_seen': 227632} +{'loss': 1.2799, 'grad_norm': 0.5291661024093628, 'learning_rate': 9e-05, 'epoch': 0.16, 'num_input_tokens_seen': 235440} +{'loss': 1.4626, 'grad_norm': 0.40466922521591187, 'learning_rate': 9.300000000000001e-05, 'epoch': 0.17, 'num_input_tokens_seen': 244032} +{'loss': 1.5386, 'grad_norm': 0.9963971972465515, 'learning_rate': 9.6e-05, 'epoch': 0.17, 'num_input_tokens_seen': 250480} +{'loss': 1.4591, 'grad_norm': 0.5913766026496887, 'learning_rate': 9.900000000000001e-05, 'epoch': 0.18, 'num_input_tokens_seen': 259600} +{'loss': 1.3403, 'grad_norm': 0.6860774755477905, 'learning_rate': 9.999878153526974e-05, 'epoch': 0.18, 'num_input_tokens_seen': 267216} +{'loss': 1.2782, 'grad_norm': 0.7265311479568481, 'learning_rate': 9.999238475781957e-05, 'epoch': 0.19, 'num_input_tokens_seen': 274624} +{'loss': 1.28, 'grad_norm': 0.7685508728027344, 'learning_rate': 9.998050575201771e-05, 'epoch': 0.19, 'num_input_tokens_seen': 280832} +{'loss': 1.476, 'grad_norm': 0.6996146440505981, 'learning_rate': 9.996314582053106e-05, 'epoch': 0.2, 'num_input_tokens_seen': 288544} +{'loss': 1.4419, 'grad_norm': 0.8047633171081543, 'learning_rate': 9.99403068670717e-05, 'epoch': 0.2, 'num_input_tokens_seen': 296560} +{'loss': 1.3927, 'grad_norm': 0.46957364678382874, 'learning_rate': 9.991199139618827e-05, 'epoch': 0.21, 'num_input_tokens_seen': 303600} +{'loss': 1.3418, 'grad_norm': 0.6611707806587219, 'learning_rate': 9.987820251299122e-05, 'epoch': 0.21, 'num_input_tokens_seen': 310800} +{'loss': 1.2896, 'grad_norm': 1.3713186979293823, 'learning_rate': 9.983894392281237e-05, 'epoch': 0.22, 'num_input_tokens_seen': 317936} +{'loss': 1.4422, 'grad_norm': 0.40376824140548706, 'learning_rate': 9.979421993079852e-05, 'epoch': 0.22, 'num_input_tokens_seen': 325936} +{'loss': 1.319, 'grad_norm': 0.6310911178588867, 'learning_rate': 9.974403544143941e-05, 'epoch': 0.23, 'num_input_tokens_seen': 334592} +{'loss': 1.1798, 'grad_norm': 0.6696099638938904, 'learning_rate': 9.968839595802982e-05, 'epoch': 0.23, 'num_input_tokens_seen': 341200} +{'loss': 1.2938, 'grad_norm': 0.5329192876815796, 'learning_rate': 9.962730758206611e-05, 'epoch': 0.24, 'num_input_tokens_seen': 347888} +{'loss': 1.4067, 'grad_norm': 0.41395628452301025, 'learning_rate': 9.956077701257709e-05, 'epoch': 0.25, 'num_input_tokens_seen': 356864} +{'loss': 1.3325, 'grad_norm': 0.49461355805397034, 'learning_rate': 9.948881154538945e-05, 'epoch': 0.25, 'num_input_tokens_seen': 365440} +{'loss': 1.3159, 'grad_norm': 0.47386232018470764, 'learning_rate': 9.941141907232765e-05, 'epoch': 0.26, 'num_input_tokens_seen': 373312} +{'loss': 1.4941, 'grad_norm': 0.9276071786880493, 'learning_rate': 9.932860808034848e-05, 'epoch': 0.26, 'num_input_tokens_seen': 383552} +{'loss': 1.2139, 'grad_norm': 0.7177005410194397, 'learning_rate': 9.924038765061042e-05, 'epoch': 0.27, 'num_input_tokens_seen': 390928} +{'loss': 1.3313, 'grad_norm': 0.5159232020378113, 'learning_rate': 9.914676745747772e-05, 'epoch': 0.27, 'num_input_tokens_seen': 398224} +{'loss': 1.2673, 'grad_norm': 0.651279866695404, 'learning_rate': 9.904775776745958e-05, 'epoch': 0.28, 'num_input_tokens_seen': 406656} +{'loss': 1.357, 'grad_norm': 0.7800387740135193, 'learning_rate': 9.894336943808426e-05, 'epoch': 0.28, 'num_input_tokens_seen': 413584} +{'loss': 1.2238, 'grad_norm': 0.9104688763618469, 'learning_rate': 9.88336139167084e-05, 'epoch': 0.29, 'num_input_tokens_seen': 420240} +{'loss': 1.2388, 'grad_norm': 0.8791072964668274, 'learning_rate': 9.871850323926177e-05, 'epoch': 0.29, 'num_input_tokens_seen': 426608} +{'loss': 1.3229, 'grad_norm': 0.6322370767593384, 'learning_rate': 9.859805002892732e-05, 'epoch': 0.3, 'num_input_tokens_seen': 434224} +{'loss': 1.387, 'grad_norm': 0.5572423338890076, 'learning_rate': 9.847226749475695e-05, 'epoch': 0.3, 'num_input_tokens_seen': 443808} +{'loss': 1.5095, 'grad_norm': 1.0534051656723022, 'learning_rate': 9.834116943022298e-05, 'epoch': 0.31, 'num_input_tokens_seen': 452080} +{'loss': 1.2474, 'grad_norm': 0.581840991973877, 'learning_rate': 9.820477021170551e-05, 'epoch': 0.31, 'num_input_tokens_seen': 460528} +{'loss': 1.2774, 'grad_norm': 0.6569491028785706, 'learning_rate': 9.806308479691595e-05, 'epoch': 0.32, 'num_input_tokens_seen': 469856} +{'loss': 1.2836, 'grad_norm': 0.6684996485710144, 'learning_rate': 9.791612872325667e-05, 'epoch': 0.33, 'num_input_tokens_seen': 477216} +{'loss': 1.4973, 'grad_norm': 0.5214691758155823, 'learning_rate': 9.776391810611718e-05, 'epoch': 0.33, 'num_input_tokens_seen': 485392} +{'loss': 1.2715, 'grad_norm': 0.60368812084198, 'learning_rate': 9.760646963710694e-05, 'epoch': 0.34, 'num_input_tokens_seen': 492832} +{'loss': 1.303, 'grad_norm': 1.012483835220337, 'learning_rate': 9.744380058222483e-05, 'epoch': 0.34, 'num_input_tokens_seen': 500112} +{'loss': 1.3055, 'grad_norm': 0.5348647236824036, 'learning_rate': 9.727592877996585e-05, 'epoch': 0.35, 'num_input_tokens_seen': 508384} +{'loss': 1.551, 'grad_norm': 0.6294691562652588, 'learning_rate': 9.710287263936484e-05, 'epoch': 0.35, 'num_input_tokens_seen': 516576} +{'loss': 1.4759, 'grad_norm': 0.6876276731491089, 'learning_rate': 9.69246511379778e-05, 'epoch': 0.36, 'num_input_tokens_seen': 523456} +{'loss': 1.358, 'grad_norm': 0.6463469862937927, 'learning_rate': 9.674128381980072e-05, 'epoch': 0.36, 'num_input_tokens_seen': 530768} +{'loss': 1.2757, 'grad_norm': 0.41761478781700134, 'learning_rate': 9.655279079312642e-05, 'epoch': 0.37, 'num_input_tokens_seen': 538288} +{'loss': 1.2448, 'grad_norm': 0.6983274817466736, 'learning_rate': 9.635919272833938e-05, 'epoch': 0.37, 'num_input_tokens_seen': 546816} +{'loss': 1.3256, 'grad_norm': 0.49796414375305176, 'learning_rate': 9.616051085564906e-05, 'epoch': 0.38, 'num_input_tokens_seen': 554800} +{'loss': 1.254, 'grad_norm': 0.9893045425415039, 'learning_rate': 9.595676696276172e-05, 'epoch': 0.38, 'num_input_tokens_seen': 560368} +{'loss': 1.3174, 'grad_norm': 1.2111352682113647, 'learning_rate': 9.574798339249125e-05, 'epoch': 0.39, 'num_input_tokens_seen': 567504} +{'loss': 1.1809, 'grad_norm': 0.6263272166252136, 'learning_rate': 9.553418304030886e-05, 'epoch': 0.39, 'num_input_tokens_seen': 575440} +{'loss': 1.3219, 'grad_norm': 0.9461821913719177, 'learning_rate': 9.53153893518325e-05, 'epoch': 0.4, 'num_input_tokens_seen': 583024} +{'loss': 1.4601, 'grad_norm': 0.5056630969047546, 'learning_rate': 9.50916263202557e-05, 'epoch': 0.41, 'num_input_tokens_seen': 591840} +{'loss': 1.3049, 'grad_norm': 0.4600001573562622, 'learning_rate': 9.486291848371643e-05, 'epoch': 0.41, 'num_input_tokens_seen': 599392} +{'loss': 1.4101, 'grad_norm': 0.587371826171875, 'learning_rate': 9.462929092260628e-05, 'epoch': 0.42, 'num_input_tokens_seen': 606560} +{'loss': 1.1738, 'grad_norm': 0.5228914022445679, 'learning_rate': 9.439076925682006e-05, 'epoch': 0.42, 'num_input_tokens_seen': 615216} +{'loss': 1.3717, 'grad_norm': 0.5150928497314453, 'learning_rate': 9.414737964294636e-05, 'epoch': 0.43, 'num_input_tokens_seen': 623696} +{'loss': 1.3903, 'grad_norm': 0.4762982130050659, 'learning_rate': 9.389914877139903e-05, 'epoch': 0.43, 'num_input_tokens_seen': 631568} +{'loss': 1.3946, 'grad_norm': 0.43511807918548584, 'learning_rate': 9.364610386349049e-05, 'epoch': 0.44, 'num_input_tokens_seen': 639424} +{'loss': 1.22, 'grad_norm': 0.6298917531967163, 'learning_rate': 9.338827266844644e-05, 'epoch': 0.44, 'num_input_tokens_seen': 647696} +{'loss': 1.3776, 'grad_norm': 0.5033426880836487, 'learning_rate': 9.312568346036288e-05, 'epoch': 0.45, 'num_input_tokens_seen': 655600} +{'loss': 1.3568, 'grad_norm': 0.639208972454071, 'learning_rate': 9.285836503510562e-05, 'epoch': 0.45, 'num_input_tokens_seen': 663200} +{'loss': 1.2346, 'grad_norm': 0.7679170966148376, 'learning_rate': 9.258634670715238e-05, 'epoch': 0.46, 'num_input_tokens_seen': 670256} +{'loss': 1.4332, 'grad_norm': 0.5441117286682129, 'learning_rate': 9.230965830637821e-05, 'epoch': 0.46, 'num_input_tokens_seen': 678304} +{'loss': 1.3245, 'grad_norm': 0.638019323348999, 'learning_rate': 9.202833017478422e-05, 'epoch': 0.47, 'num_input_tokens_seen': 687392} +{'loss': 1.3583, 'grad_norm': 0.41635143756866455, 'learning_rate': 9.174239316317033e-05, 'epoch': 0.47, 'num_input_tokens_seen': 695344} +{'loss': 1.271, 'grad_norm': 1.1276559829711914, 'learning_rate': 9.145187862775209e-05, 'epoch': 0.48, 'num_input_tokens_seen': 704736} +{'loss': 1.207, 'grad_norm': 0.78920978307724, 'learning_rate': 9.11568184267221e-05, 'epoch': 0.49, 'num_input_tokens_seen': 710720} +{'loss': 1.1755, 'grad_norm': 0.36885419487953186, 'learning_rate': 9.085724491675642e-05, 'epoch': 0.49, 'num_input_tokens_seen': 718128} +{'loss': 1.2883, 'grad_norm': 0.7129054069519043, 'learning_rate': 9.055319094946633e-05, 'epoch': 0.5, 'num_input_tokens_seen': 725568} +{'loss': 1.314, 'grad_norm': 0.8274949789047241, 'learning_rate': 9.02446898677957e-05, 'epoch': 0.5, 'num_input_tokens_seen': 732928} +{'loss': 1.2864, 'grad_norm': 0.6434007883071899, 'learning_rate': 8.993177550236464e-05, 'epoch': 0.51, 'num_input_tokens_seen': 741536} +{'loss': 1.227, 'grad_norm': 0.5015966296195984, 'learning_rate': 8.961448216775954e-05, 'epoch': 0.51, 'num_input_tokens_seen': 748288} +{'loss': 1.3016, 'grad_norm': 0.4610384702682495, 'learning_rate': 8.92928446587701e-05, 'epoch': 0.52, 'num_input_tokens_seen': 757216} +{'loss': 1.2786, 'grad_norm': 0.7528018355369568, 'learning_rate': 8.896689824657372e-05, 'epoch': 0.52, 'num_input_tokens_seen': 764240} +{'loss': 1.3358, 'grad_norm': 0.5625883340835571, 'learning_rate': 8.863667867486756e-05, 'epoch': 0.53, 'num_input_tokens_seen': 774416} +{'loss': 1.3911, 'grad_norm': 0.720058798789978, 'learning_rate': 8.83022221559489e-05, 'epoch': 0.53, 'num_input_tokens_seen': 782656} +{'loss': 1.193, 'grad_norm': 0.4910522997379303, 'learning_rate': 8.796356536674403e-05, 'epoch': 0.54, 'num_input_tokens_seen': 790032} +{'loss': 1.2034, 'grad_norm': 0.6103739142417908, 'learning_rate': 8.762074544478623e-05, 'epoch': 0.54, 'num_input_tokens_seen': 798048} +{'loss': 1.1278, 'grad_norm': 0.5440624356269836, 'learning_rate': 8.727379998414311e-05, 'epoch': 0.55, 'num_input_tokens_seen': 805792} +{'loss': 1.3952, 'grad_norm': 0.7663698792457581, 'learning_rate': 8.692276703129421e-05, 'epoch': 0.55, 'num_input_tokens_seen': 813280} +{'loss': 1.3797, 'grad_norm': 0.45255619287490845, 'learning_rate': 8.656768508095853e-05, 'epoch': 0.56, 'num_input_tokens_seen': 822464} +{'loss': 1.311, 'grad_norm': 0.5178409218788147, 'learning_rate': 8.620859307187339e-05, 'epoch': 0.57, 'num_input_tokens_seen': 832672} +{'loss': 1.3417, 'grad_norm': 0.44465571641921997, 'learning_rate': 8.584553038252414e-05, 'epoch': 0.57, 'num_input_tokens_seen': 841248} +{'loss': 1.2492, 'grad_norm': 0.5521979331970215, 'learning_rate': 8.547853682682604e-05, 'epoch': 0.58, 'num_input_tokens_seen': 848432} +{'loss': 1.4109, 'grad_norm': 0.5571572780609131, 'learning_rate': 8.510765264975813e-05, 'epoch': 0.58, 'num_input_tokens_seen': 858096} +{'loss': 1.3382, 'grad_norm': 0.389121949672699, 'learning_rate': 8.473291852294987e-05, 'epoch': 0.59, 'num_input_tokens_seen': 866784} +{'loss': 1.2248, 'grad_norm': 0.491073876619339, 'learning_rate': 8.435437554022115e-05, 'epoch': 0.59, 'num_input_tokens_seen': 873904} +{'loss': 1.1435, 'grad_norm': 0.7570438385009766, 'learning_rate': 8.397206521307584e-05, 'epoch': 0.6, 'num_input_tokens_seen': 879856} +{'loss': 1.1193, 'grad_norm': 0.5216025114059448, 'learning_rate': 8.358602946614951e-05, 'epoch': 0.6, 'num_input_tokens_seen': 887152} +{'loss': 1.2794, 'grad_norm': 0.47078531980514526, 'learning_rate': 8.319631063261209e-05, 'epoch': 0.61, 'num_input_tokens_seen': 896752} +{'loss': 1.2186, 'grad_norm': 0.5252036452293396, 'learning_rate': 8.280295144952536e-05, 'epoch': 0.61, 'num_input_tokens_seen': 905696} +{'loss': 1.2683, 'grad_norm': 0.5388907790184021, 'learning_rate': 8.240599505315655e-05, 'epoch': 0.62, 'num_input_tokens_seen': 913536} +{'loss': 1.1247, 'grad_norm': 0.5209746360778809, 'learning_rate': 8.200548497424778e-05, 'epoch': 0.62, 'num_input_tokens_seen': 919584} +{'loss': 1.4098, 'grad_norm': 0.4642646014690399, 'learning_rate': 8.160146513324254e-05, 'epoch': 0.63, 'num_input_tokens_seen': 927248} +{'loss': 1.2803, 'grad_norm': 0.501924991607666, 'learning_rate': 8.119397983546932e-05, 'epoch': 0.63, 'num_input_tokens_seen': 937328} +{'loss': 1.4474, 'grad_norm': 0.39491286873817444, 'learning_rate': 8.07830737662829e-05, 'epoch': 0.64, 'num_input_tokens_seen': 946400} +{'loss': 1.3626, 'grad_norm': 1.1283897161483765, 'learning_rate': 8.036879198616434e-05, 'epoch': 0.65, 'num_input_tokens_seen': 954912} +{'loss': 1.2167, 'grad_norm': 0.5616379976272583, 'learning_rate': 7.99511799257793e-05, 'epoch': 0.65, 'num_input_tokens_seen': 963056} +{'loss': 1.2025, 'grad_norm': 0.7390901446342468, 'learning_rate': 7.953028338099627e-05, 'epoch': 0.66, 'num_input_tokens_seen': 970048} +{'loss': 1.313, 'grad_norm': 0.9030864238739014, 'learning_rate': 7.910614850786448e-05, 'epoch': 0.66, 'num_input_tokens_seen': 977312} +{'loss': 1.479, 'grad_norm': 0.61843341588974, 'learning_rate': 7.86788218175523e-05, 'epoch': 0.67, 'num_input_tokens_seen': 984576} +{'loss': 1.435, 'grad_norm': 0.5310498476028442, 'learning_rate': 7.82483501712469e-05, 'epoch': 0.67, 'num_input_tokens_seen': 992464} +{'loss': 1.1344, 'grad_norm': 0.5555992126464844, 'learning_rate': 7.781478077501525e-05, 'epoch': 0.68, 'num_input_tokens_seen': 1000400} +{'loss': 1.2989, 'grad_norm': 0.4795224070549011, 'learning_rate': 7.737816117462752e-05, 'epoch': 0.68, 'num_input_tokens_seen': 1007952} +{'loss': 1.0671, 'grad_norm': 0.6490950584411621, 'learning_rate': 7.693853925034315e-05, 'epoch': 0.69, 'num_input_tokens_seen': 1014064} +{'loss': 1.2786, 'grad_norm': 1.0598840713500977, 'learning_rate': 7.649596321166024e-05, 'epoch': 0.69, 'num_input_tokens_seen': 1021472} +{'loss': 1.4096, 'grad_norm': 0.5905397534370422, 'learning_rate': 7.605048159202883e-05, 'epoch': 0.7, 'num_input_tokens_seen': 1028848} +{'loss': 1.1785, 'grad_norm': 0.6974928975105286, 'learning_rate': 7.560214324352858e-05, 'epoch': 0.7, 'num_input_tokens_seen': 1037760} +{'loss': 1.2757, 'grad_norm': 0.6378306150436401, 'learning_rate': 7.515099733151177e-05, 'epoch': 0.71, 'num_input_tokens_seen': 1045472} +{'loss': 1.2499, 'grad_norm': 0.6533056497573853, 'learning_rate': 7.469709332921155e-05, 'epoch': 0.71, 'num_input_tokens_seen': 1053024} +{'loss': 1.2605, 'grad_norm': 0.762304961681366, 'learning_rate': 7.424048101231686e-05, 'epoch': 0.72, 'num_input_tokens_seen': 1060336} +{'loss': 1.2975, 'grad_norm': 0.5734354853630066, 'learning_rate': 7.378121045351378e-05, 'epoch': 0.73, 'num_input_tokens_seen': 1067184} +{'loss': 1.1854, 'grad_norm': 0.7655225396156311, 'learning_rate': 7.331933201699457e-05, 'epoch': 0.73, 'num_input_tokens_seen': 1074768} +{'loss': 1.1631, 'grad_norm': 0.7301100492477417, 'learning_rate': 7.285489635293472e-05, 'epoch': 0.74, 'num_input_tokens_seen': 1083360} +{'loss': 1.2159, 'grad_norm': 0.6851321458816528, 'learning_rate': 7.238795439193848e-05, 'epoch': 0.74, 'num_input_tokens_seen': 1091200} +{'loss': 1.201, 'grad_norm': 0.513536810874939, 'learning_rate': 7.191855733945387e-05, 'epoch': 0.75, 'num_input_tokens_seen': 1100976} +{'loss': 1.3129, 'grad_norm': 0.9055917263031006, 'learning_rate': 7.14467566701573e-05, 'epoch': 0.75, 'num_input_tokens_seen': 1109264} +{'loss': 1.5195, 'grad_norm': 0.5442625880241394, 'learning_rate': 7.097260412230886e-05, 'epoch': 0.76, 'num_input_tokens_seen': 1117568} +{'loss': 1.3843, 'grad_norm': 0.48822855949401855, 'learning_rate': 7.049615169207864e-05, 'epoch': 0.76, 'num_input_tokens_seen': 1126416} +{'loss': 1.1864, 'grad_norm': 0.4853934645652771, 'learning_rate': 7.001745162784477e-05, 'epoch': 0.77, 'num_input_tokens_seen': 1133984} +{'loss': 1.4373, 'grad_norm': 0.7485060095787048, 'learning_rate': 6.953655642446368e-05, 'epoch': 0.77, 'num_input_tokens_seen': 1142608} +{'loss': 1.2651, 'grad_norm': 0.6429968476295471, 'learning_rate': 6.905351881751372e-05, 'epoch': 0.78, 'num_input_tokens_seen': 1152304} +{'loss': 1.2864, 'grad_norm': 0.6068575978279114, 'learning_rate': 6.856839177751176e-05, 'epoch': 0.78, 'num_input_tokens_seen': 1161072} +{'loss': 1.1525, 'grad_norm': 0.697914719581604, 'learning_rate': 6.808122850410461e-05, 'epoch': 0.79, 'num_input_tokens_seen': 1168480} +{'loss': 1.3601, 'grad_norm': 0.8854771852493286, 'learning_rate': 6.759208242023509e-05, 'epoch': 0.79, 'num_input_tokens_seen': 1175392} +{'loss': 1.4018, 'grad_norm': 0.5190437436103821, 'learning_rate': 6.710100716628344e-05, 'epoch': 0.8, 'num_input_tokens_seen': 1184544} +{'loss': 1.2425, 'grad_norm': 0.6468976736068726, 'learning_rate': 6.660805659418516e-05, 'epoch': 0.81, 'num_input_tokens_seen': 1192880} +{'loss': 1.2589, 'grad_norm': 0.5303670763969421, 'learning_rate': 6.611328476152557e-05, 'epoch': 0.81, 'num_input_tokens_seen': 1200928} +{'loss': 1.3289, 'grad_norm': 0.7813239097595215, 'learning_rate': 6.561674592561163e-05, 'epoch': 0.82, 'num_input_tokens_seen': 1209056} +{'loss': 1.2851, 'grad_norm': 0.4044243395328522, 'learning_rate': 6.511849453752223e-05, 'epoch': 0.82, 'num_input_tokens_seen': 1217040} +{'loss': 1.2584, 'grad_norm': 0.6317430734634399, 'learning_rate': 6.461858523613684e-05, 'epoch': 0.83, 'num_input_tokens_seen': 1223712} +{'loss': 1.2729, 'grad_norm': 0.5451323986053467, 'learning_rate': 6.411707284214384e-05, 'epoch': 0.83, 'num_input_tokens_seen': 1230736} +{'loss': 1.3603, 'grad_norm': 0.5434067249298096, 'learning_rate': 6.361401235202872e-05, 'epoch': 0.84, 'num_input_tokens_seen': 1237728} +{'loss': 1.1509, 'grad_norm': 0.41243186593055725, 'learning_rate': 6.310945893204324e-05, 'epoch': 0.84, 'num_input_tokens_seen': 1247568} +{'loss': 1.3986, 'grad_norm': 0.9249187111854553, 'learning_rate': 6.26034679121557e-05, 'epoch': 0.85, 'num_input_tokens_seen': 1255968} +{'loss': 1.3266, 'grad_norm': 0.5903889536857605, 'learning_rate': 6.209609477998338e-05, 'epoch': 0.85, 'num_input_tokens_seen': 1262640} +{'loss': 1.5078, 'grad_norm': 0.880121111869812, 'learning_rate': 6.158739517470786e-05, 'epoch': 0.86, 'num_input_tokens_seen': 1270464} +{'loss': 1.0109, 'grad_norm': 0.8485903143882751, 'learning_rate': 6.107742488097338e-05, 'epoch': 0.86, 'num_input_tokens_seen': 1278400} +{'loss': 1.3723, 'grad_norm': 0.8262659311294556, 'learning_rate': 6.056623982276944e-05, 'epoch': 0.87, 'num_input_tokens_seen': 1287072} +{'loss': 1.3698, 'grad_norm': 0.9479841589927673, 'learning_rate': 6.005389605729824e-05, 'epoch': 0.87, 'num_input_tokens_seen': 1294784} +{'loss': 1.2799, 'grad_norm': 0.8312945365905762, 'learning_rate': 5.9540449768827246e-05, 'epoch': 0.88, 'num_input_tokens_seen': 1303872} +{'loss': 1.1711, 'grad_norm': 0.8914958238601685, 'learning_rate': 5.902595726252801e-05, 'epoch': 0.89, 'num_input_tokens_seen': 1309888} +{'eval_loss': 1.2698992490768433, 'eval_runtime': 23.2641, 'eval_samples_per_second': 42.985, 'eval_steps_per_second': 21.492, 'epoch': 0.89, 'num_input_tokens_seen': 1314896} +{'loss': 1.3553, 'grad_norm': 0.839598536491394, 'learning_rate': 5.851047495830163e-05, 'epoch': 0.89, 'num_input_tokens_seen': 1317600} +{'loss': 1.4642, 'grad_norm': 0.6949290037155151, 'learning_rate': 5.799405938459175e-05, 'epoch': 0.9, 'num_input_tokens_seen': 1326576} +{'loss': 1.1863, 'grad_norm': 0.594464898109436, 'learning_rate': 5.747676717218549e-05, 'epoch': 0.9, 'num_input_tokens_seen': 1332944} +{'loss': 1.0854, 'grad_norm': 0.502791166305542, 'learning_rate': 5.695865504800327e-05, 'epoch': 0.91, 'num_input_tokens_seen': 1340672} +{'loss': 1.1361, 'grad_norm': 0.8494873046875, 'learning_rate': 5.643977982887815e-05, 'epoch': 0.91, 'num_input_tokens_seen': 1347408} +{'loss': 1.5376, 'grad_norm': 0.5948024392127991, 'learning_rate': 5.5920198415325064e-05, 'epoch': 0.92, 'num_input_tokens_seen': 1356096} +{'loss': 1.3248, 'grad_norm': 0.8378577828407288, 'learning_rate': 5.539996778530115e-05, 'epoch': 0.92, 'num_input_tokens_seen': 1364832} +{'loss': 1.2347, 'grad_norm': 0.7156303524971008, 'learning_rate': 5.487914498795747e-05, 'epoch': 0.93, 'num_input_tokens_seen': 1371520} +{'loss': 1.2789, 'grad_norm': 0.6701870560646057, 'learning_rate': 5.435778713738292e-05, 'epoch': 0.93, 'num_input_tokens_seen': 1382112} +{'loss': 1.2137, 'grad_norm': 0.6556785106658936, 'learning_rate': 5.383595140634093e-05, 'epoch': 0.94, 'num_input_tokens_seen': 1390880} +{'loss': 1.1884, 'grad_norm': 0.5082133412361145, 'learning_rate': 5.3313695020000024e-05, 'epoch': 0.94, 'num_input_tokens_seen': 1399248} +{'loss': 1.0717, 'grad_norm': 0.8579297065734863, 'learning_rate': 5.279107524965819e-05, 'epoch': 0.95, 'num_input_tokens_seen': 1407344} +{'loss': 1.341, 'grad_norm': 0.7548760771751404, 'learning_rate': 5.226814940646269e-05, 'epoch': 0.95, 'num_input_tokens_seen': 1414864} +{'loss': 1.2898, 'grad_norm': 0.42420780658721924, 'learning_rate': 5.174497483512506e-05, 'epoch': 0.96, 'num_input_tokens_seen': 1424144} +{'loss': 1.226, 'grad_norm': 0.6996252536773682, 'learning_rate': 5.1221608907632665e-05, 'epoch': 0.97, 'num_input_tokens_seen': 1431120} +{'loss': 1.3647, 'grad_norm': 0.8375110030174255, 'learning_rate': 5.0698109016957274e-05, 'epoch': 0.97, 'num_input_tokens_seen': 1439056} +{'loss': 1.168, 'grad_norm': 0.4961775541305542, 'learning_rate': 5.017453257076119e-05, 'epoch': 0.98, 'num_input_tokens_seen': 1447184} +{'loss': 1.3466, 'grad_norm': 0.7079893350601196, 'learning_rate': 4.965093698510193e-05, 'epoch': 0.98, 'num_input_tokens_seen': 1454992} +{'loss': 1.1918, 'grad_norm': 0.7946033477783203, 'learning_rate': 4.912737967813583e-05, 'epoch': 0.99, 'num_input_tokens_seen': 1462048} +{'loss': 1.2577, 'grad_norm': 0.6242976784706116, 'learning_rate': 4.860391806382157e-05, 'epoch': 0.99, 'num_input_tokens_seen': 1469024} +{'loss': 1.2251, 'grad_norm': 0.681969940662384, 'learning_rate': 4.8080609545624004e-05, 'epoch': 1.0, 'num_input_tokens_seen': 1476496} +{'loss': 1.3152, 'grad_norm': 0.4314422011375427, 'learning_rate': 4.755751151021934e-05, 'epoch': 1.0, 'num_input_tokens_seen': 1485568} +{'loss': 1.1243, 'grad_norm': 0.7379014492034912, 'learning_rate': 4.703468132120193e-05, 'epoch': 1.01, 'num_input_tokens_seen': 1492544} +{'loss': 1.3484, 'grad_norm': 0.5139598846435547, 'learning_rate': 4.6512176312793736e-05, 'epoch': 1.01, 'num_input_tokens_seen': 1501216} +{'loss': 1.1443, 'grad_norm': 0.735149085521698, 'learning_rate': 4.599005378355706e-05, 'epoch': 1.02, 'num_input_tokens_seen': 1509824} +{'loss': 1.1144, 'grad_norm': 0.5389354228973389, 'learning_rate': 4.5468370990111006e-05, 'epoch': 1.02, 'num_input_tokens_seen': 1516736} +{'loss': 1.4006, 'grad_norm': 0.6230564713478088, 'learning_rate': 4.494718514085268e-05, 'epoch': 1.03, 'num_input_tokens_seen': 1524208} +{'loss': 1.2874, 'grad_norm': 0.5797858834266663, 'learning_rate': 4.442655338968373e-05, 'epoch': 1.03, 'num_input_tokens_seen': 1534656} +{'loss': 1.0906, 'grad_norm': 0.5728005170822144, 'learning_rate': 4.390653282974264e-05, 'epoch': 1.04, 'num_input_tokens_seen': 1541136} +{'loss': 1.1907, 'grad_norm': 0.5849784016609192, 'learning_rate': 4.3387180487143876e-05, 'epoch': 1.05, 'num_input_tokens_seen': 1550640} +{'loss': 1.2323, 'grad_norm': 1.1090894937515259, 'learning_rate': 4.2868553314724425e-05, 'epoch': 1.05, 'num_input_tokens_seen': 1557488} +{'loss': 1.3508, 'grad_norm': 0.5953820943832397, 'learning_rate': 4.23507081857981e-05, 'epoch': 1.06, 'num_input_tokens_seen': 1565392} +{'loss': 1.3322, 'grad_norm': 0.45134949684143066, 'learning_rate': 4.1833701887918904e-05, 'epoch': 1.06, 'num_input_tokens_seen': 1574080} +{'loss': 1.0924, 'grad_norm': 0.9110301733016968, 'learning_rate': 4.131759111665349e-05, 'epoch': 1.07, 'num_input_tokens_seen': 1581312} +{'loss': 1.2264, 'grad_norm': 1.416802167892456, 'learning_rate': 4.080243246936399e-05, 'epoch': 1.07, 'num_input_tokens_seen': 1588096} +{'loss': 1.05, 'grad_norm': 0.8100959062576294, 'learning_rate': 4.028828243900141e-05, 'epoch': 1.08, 'num_input_tokens_seen': 1594736} +{'loss': 1.3512, 'grad_norm': 0.5712919235229492, 'learning_rate': 3.9775197407910485e-05, 'epoch': 1.08, 'num_input_tokens_seen': 1603312} +{'loss': 1.281, 'grad_norm': 0.7171860933303833, 'learning_rate': 3.926323364164684e-05, 'epoch': 1.09, 'num_input_tokens_seen': 1613232} +{'loss': 1.3661, 'grad_norm': 0.633402407169342, 'learning_rate': 3.875244728280676e-05, 'epoch': 1.09, 'num_input_tokens_seen': 1621936} +{'loss': 1.1989, 'grad_norm': 0.6984114050865173, 'learning_rate': 3.82428943448705e-05, 'epoch': 1.1, 'num_input_tokens_seen': 1629984} +{'loss': 1.0763, 'grad_norm': 1.2278004884719849, 'learning_rate': 3.773463070605987e-05, 'epoch': 1.1, 'num_input_tokens_seen': 1636016} +{'loss': 1.1786, 'grad_norm': 0.9503414034843445, 'learning_rate': 3.7227712103210486e-05, 'epoch': 1.11, 'num_input_tokens_seen': 1642832} +{'loss': 1.0718, 'grad_norm': 3.3439748287200928, 'learning_rate': 3.6722194125659556e-05, 'epoch': 1.11, 'num_input_tokens_seen': 1648752} +{'loss': 1.3182, 'grad_norm': 0.8299528360366821, 'learning_rate': 3.6218132209150045e-05, 'epoch': 1.12, 'num_input_tokens_seen': 1657488} +{'loss': 1.3274, 'grad_norm': 0.4776313602924347, 'learning_rate': 3.5715581629751326e-05, 'epoch': 1.13, 'num_input_tokens_seen': 1666720} +{'loss': 1.3109, 'grad_norm': 0.610567033290863, 'learning_rate': 3.5214597497797684e-05, 'epoch': 1.13, 'num_input_tokens_seen': 1678048} +{'loss': 1.287, 'grad_norm': 0.7393361926078796, 'learning_rate': 3.471523475184472e-05, 'epoch': 1.14, 'num_input_tokens_seen': 1687120} +{'loss': 1.2379, 'grad_norm': 0.752165675163269, 'learning_rate': 3.4217548152644885e-05, 'epoch': 1.14, 'num_input_tokens_seen': 1695232} +{'loss': 1.2971, 'grad_norm': 0.7106702327728271, 'learning_rate': 3.372159227714218e-05, 'epoch': 1.15, 'num_input_tokens_seen': 1703424} +{'loss': 1.1837, 'grad_norm': 1.0731079578399658, 'learning_rate': 3.322742151248725e-05, 'epoch': 1.15, 'num_input_tokens_seen': 1711504} +{'loss': 1.1607, 'grad_norm': 0.7986055016517639, 'learning_rate': 3.273509005007327e-05, 'epoch': 1.16, 'num_input_tokens_seen': 1720016} +{'loss': 1.1984, 'grad_norm': 0.8783122897148132, 'learning_rate': 3.224465187959316e-05, 'epoch': 1.16, 'num_input_tokens_seen': 1725952} +{'loss': 1.2528, 'grad_norm': 0.6338449716567993, 'learning_rate': 3.1756160783119016e-05, 'epoch': 1.17, 'num_input_tokens_seen': 1735728} +{'loss': 1.2614, 'grad_norm': 1.1053016185760498, 'learning_rate': 3.12696703292044e-05, 'epoch': 1.17, 'num_input_tokens_seen': 1742160} +{'loss': 1.2315, 'grad_norm': 0.9506245255470276, 'learning_rate': 3.078523386700982e-05, 'epoch': 1.18, 'num_input_tokens_seen': 1748592} +{'loss': 1.1794, 'grad_norm': 0.909778892993927, 'learning_rate': 3.0302904520452447e-05, 'epoch': 1.18, 'num_input_tokens_seen': 1756800} +{'loss': 1.205, 'grad_norm': 0.7909402847290039, 'learning_rate': 2.9822735182380496e-05, 'epoch': 1.19, 'num_input_tokens_seen': 1764624} +{'loss': 1.2932, 'grad_norm': 0.7799058556556702, 'learning_rate': 2.934477850877292e-05, 'epoch': 1.19, 'num_input_tokens_seen': 1772688} +{'loss': 1.2259, 'grad_norm': 0.912695050239563, 'learning_rate': 2.886908691296504e-05, 'epoch': 1.2, 'num_input_tokens_seen': 1780128} +{'loss': 1.2725, 'grad_norm': 0.8391047120094299, 'learning_rate': 2.8395712559900877e-05, 'epoch': 1.21, 'num_input_tokens_seen': 1788160} +{'loss': 1.0984, 'grad_norm': 0.9869920611381531, 'learning_rate': 2.7924707360412746e-05, 'epoch': 1.21, 'num_input_tokens_seen': 1794336} +{'loss': 1.3735, 'grad_norm': 1.0749855041503906, 'learning_rate': 2.7456122965528475e-05, 'epoch': 1.22, 'num_input_tokens_seen': 1801280} +{'loss': 1.2211, 'grad_norm': 0.6031284332275391, 'learning_rate': 2.699001076080742e-05, 'epoch': 1.22, 'num_input_tokens_seen': 1811088} +{'loss': 1.123, 'grad_norm': 0.86882483959198, 'learning_rate': 2.6526421860705473e-05, 'epoch': 1.23, 'num_input_tokens_seen': 1816848} +{'loss': 1.2838, 'grad_norm': 0.6549557447433472, 'learning_rate': 2.6065407102969664e-05, 'epoch': 1.23, 'num_input_tokens_seen': 1824992} +{'loss': 1.1937, 'grad_norm': 0.737458348274231, 'learning_rate': 2.560701704306336e-05, 'epoch': 1.24, 'num_input_tokens_seen': 1831712} +{'loss': 1.2453, 'grad_norm': 0.5440405607223511, 'learning_rate': 2.5151301948622237e-05, 'epoch': 1.24, 'num_input_tokens_seen': 1842272} +{'loss': 1.1175, 'grad_norm': 0.9566605687141418, 'learning_rate': 2.469831179394182e-05, 'epoch': 1.25, 'num_input_tokens_seen': 1847776} +{'loss': 1.2671, 'grad_norm': 0.5786175727844238, 'learning_rate': 2.4248096254497288e-05, 'epoch': 1.25, 'num_input_tokens_seen': 1856992} +{'loss': 1.2001, 'grad_norm': 1.0269118547439575, 'learning_rate': 2.3800704701496053e-05, 'epoch': 1.26, 'num_input_tokens_seen': 1864448} +{'loss': 1.2348, 'grad_norm': 0.783041775226593, 'learning_rate': 2.33561861964635e-05, 'epoch': 1.26, 'num_input_tokens_seen': 1873680} +{'loss': 1.1127, 'grad_norm': 0.8268475532531738, 'learning_rate': 2.2914589485863014e-05, 'epoch': 1.27, 'num_input_tokens_seen': 1881744} +{'loss': 1.1406, 'grad_norm': 0.948457658290863, 'learning_rate': 2.247596299575022e-05, 'epoch': 1.27, 'num_input_tokens_seen': 1889808} +{'loss': 1.1346, 'grad_norm': 0.7794213891029358, 'learning_rate': 2.2040354826462668e-05, 'epoch': 1.28, 'num_input_tokens_seen': 1896304} +{'loss': 1.2673, 'grad_norm': 0.9084338545799255, 'learning_rate': 2.160781274734495e-05, 'epoch': 1.29, 'num_input_tokens_seen': 1906672} +{'loss': 1.1793, 'grad_norm': 0.8108258843421936, 'learning_rate': 2.117838419151034e-05, 'epoch': 1.29, 'num_input_tokens_seen': 1913616} +{'loss': 1.1043, 'grad_norm': 0.8344593644142151, 'learning_rate': 2.0752116250639225e-05, 'epoch': 1.3, 'num_input_tokens_seen': 1920944} +{'loss': 1.2762, 'grad_norm': 0.8615306615829468, 'learning_rate': 2.0329055669814934e-05, 'epoch': 1.3, 'num_input_tokens_seen': 1929216} +{'loss': 1.02, 'grad_norm': 1.18500554561615, 'learning_rate': 1.9909248842397584e-05, 'epoch': 1.31, 'num_input_tokens_seen': 1935792} +{'loss': 1.0995, 'grad_norm': 1.018406867980957, 'learning_rate': 1.9492741804936622e-05, 'epoch': 1.31, 'num_input_tokens_seen': 1942752} +{'loss': 1.2871, 'grad_norm': 0.7798596620559692, 'learning_rate': 1.9079580232122303e-05, 'epoch': 1.32, 'num_input_tokens_seen': 1950528} +{'loss': 1.171, 'grad_norm': 0.9787166714668274, 'learning_rate': 1.866980943177699e-05, 'epoch': 1.32, 'num_input_tokens_seen': 1957392} +{'loss': 1.2077, 'grad_norm': 0.6474674344062805, 'learning_rate': 1.8263474339886628e-05, 'epoch': 1.33, 'num_input_tokens_seen': 1966480} +{'loss': 1.0823, 'grad_norm': 1.035290002822876, 'learning_rate': 1.7860619515673033e-05, 'epoch': 1.33, 'num_input_tokens_seen': 1973968} +{'loss': 1.1719, 'grad_norm': 1.129233717918396, 'learning_rate': 1.746128913670746e-05, 'epoch': 1.34, 'num_input_tokens_seen': 1980416} +{'loss': 1.1616, 'grad_norm': 0.7446464896202087, 'learning_rate': 1.7065526994065973e-05, 'epoch': 1.34, 'num_input_tokens_seen': 1987808} +{'loss': 1.2321, 'grad_norm': 0.6898766160011292, 'learning_rate': 1.667337648752738e-05, 'epoch': 1.35, 'num_input_tokens_seen': 1995728} +{'loss': 1.4247, 'grad_norm': 0.7107200622558594, 'learning_rate': 1.6284880620813848e-05, 'epoch': 1.35, 'num_input_tokens_seen': 2004016} +{'loss': 1.1731, 'grad_norm': 0.6560705304145813, 'learning_rate': 1.5900081996875083e-05, 'epoch': 1.36, 'num_input_tokens_seen': 2012384} +{'loss': 1.0719, 'grad_norm': 0.6298875212669373, 'learning_rate': 1.551902281321651e-05, 'epoch': 1.37, 'num_input_tokens_seen': 2020096} +{'loss': 1.119, 'grad_norm': 1.1242854595184326, 'learning_rate': 1.5141744857271778e-05, 'epoch': 1.37, 'num_input_tokens_seen': 2027056} +{'loss': 1.2802, 'grad_norm': 1.0423098802566528, 'learning_rate': 1.4768289501820265e-05, 'epoch': 1.38, 'num_input_tokens_seen': 2034256} +{'loss': 1.172, 'grad_norm': 0.9645527601242065, 'learning_rate': 1.439869770045018e-05, 'epoch': 1.38, 'num_input_tokens_seen': 2040768} +{'loss': 1.2637, 'grad_norm': 0.6331435441970825, 'learning_rate': 1.4033009983067452e-05, 'epoch': 1.39, 'num_input_tokens_seen': 2047648} +{'loss': 1.1766, 'grad_norm': 1.2376108169555664, 'learning_rate': 1.367126645145121e-05, 'epoch': 1.39, 'num_input_tokens_seen': 2054848} +{'loss': 1.3347, 'grad_norm': 0.7504984736442566, 'learning_rate': 1.3313506774856177e-05, 'epoch': 1.4, 'num_input_tokens_seen': 2063744} +{'loss': 1.2374, 'grad_norm': 0.9626278877258301, 'learning_rate': 1.29597701856625e-05, 'epoch': 1.4, 'num_input_tokens_seen': 2072336} +{'loss': 1.303, 'grad_norm': 0.8408162593841553, 'learning_rate': 1.2610095475073414e-05, 'epoch': 1.41, 'num_input_tokens_seen': 2080992} +{'loss': 1.0784, 'grad_norm': 1.0727360248565674, 'learning_rate': 1.22645209888614e-05, 'epoch': 1.41, 'num_input_tokens_seen': 2087728} +{'loss': 1.2539, 'grad_norm': 0.8654668927192688, 'learning_rate': 1.1923084623163172e-05, 'epoch': 1.42, 'num_input_tokens_seen': 2095968} +{'loss': 1.2818, 'grad_norm': 0.7734363675117493, 'learning_rate': 1.1585823820323843e-05, 'epoch': 1.42, 'num_input_tokens_seen': 2103088} +{'loss': 1.143, 'grad_norm': 0.9622248411178589, 'learning_rate': 1.1252775564791024e-05, 'epoch': 1.43, 'num_input_tokens_seen': 2111408} +{'loss': 1.233, 'grad_norm': 0.634290337562561, 'learning_rate': 1.0923976379059058e-05, 'epoch': 1.43, 'num_input_tokens_seen': 2119632} +{'loss': 0.9924, 'grad_norm': 0.9862900376319885, 'learning_rate': 1.0599462319663905e-05, 'epoch': 1.44, 'num_input_tokens_seen': 2127440} +{'loss': 1.2331, 'grad_norm': 1.0004602670669556, 'learning_rate': 1.0279268973229089e-05, 'epoch': 1.45, 'num_input_tokens_seen': 2137088} +{'loss': 1.196, 'grad_norm': 0.6738834381103516, 'learning_rate': 9.963431452563332e-06, 'epoch': 1.45, 'num_input_tokens_seen': 2144992} +{'loss': 1.0854, 'grad_norm': 0.9389998912811279, 'learning_rate': 9.651984392809914e-06, 'epoch': 1.46, 'num_input_tokens_seen': 2153936} +{'loss': 1.1659, 'grad_norm': 0.8484275937080383, 'learning_rate': 9.344961947648623e-06, 'epoch': 1.46, 'num_input_tokens_seen': 2161760} +{'loss': 1.1839, 'grad_norm': 0.7993901968002319, 'learning_rate': 9.042397785550405e-06, 'epoch': 1.47, 'num_input_tokens_seen': 2168736} +{'loss': 1.2836, 'grad_norm': 0.9979962110519409, 'learning_rate': 8.744325086085248e-06, 'epoch': 1.47, 'num_input_tokens_seen': 2175712} +{'loss': 1.1372, 'grad_norm': 1.106884479522705, 'learning_rate': 8.450776536283594e-06, 'epoch': 1.48, 'num_input_tokens_seen': 2182960} +{'loss': 1.2549, 'grad_norm': 0.8891430497169495, 'learning_rate': 8.16178432705192e-06, 'epoch': 1.48, 'num_input_tokens_seen': 2191232} +{'loss': 1.1984, 'grad_norm': 0.680219829082489, 'learning_rate': 7.877380149642626e-06, 'epoch': 1.49, 'num_input_tokens_seen': 2200064} +{'loss': 1.4679, 'grad_norm': 1.0326625108718872, 'learning_rate': 7.597595192178702e-06, 'epoch': 1.49, 'num_input_tokens_seen': 2209072} +{'loss': 1.3558, 'grad_norm': 0.7626471519470215, 'learning_rate': 7.322460136233622e-06, 'epoch': 1.5, 'num_input_tokens_seen': 2216608} +{'loss': 1.1688, 'grad_norm': 0.655614972114563, 'learning_rate': 7.052005153466779e-06, 'epoch': 1.5, 'num_input_tokens_seen': 2224784} +{'loss': 1.0322, 'grad_norm': 0.9603847861289978, 'learning_rate': 6.786259902314768e-06, 'epoch': 1.51, 'num_input_tokens_seen': 2232640} +{'loss': 1.1789, 'grad_norm': 0.846725583076477, 'learning_rate': 6.52525352473905e-06, 'epoch': 1.51, 'num_input_tokens_seen': 2241184} +{'loss': 1.2104, 'grad_norm': 0.9454957246780396, 'learning_rate': 6.269014643030213e-06, 'epoch': 1.52, 'num_input_tokens_seen': 2248848} +{'loss': 1.2657, 'grad_norm': 0.6011205911636353, 'learning_rate': 6.017571356669183e-06, 'epoch': 1.53, 'num_input_tokens_seen': 2257808} +{'loss': 1.1423, 'grad_norm': 1.0003712177276611, 'learning_rate': 5.770951239245803e-06, 'epoch': 1.53, 'num_input_tokens_seen': 2264848} +{'loss': 1.0841, 'grad_norm': 0.9697505831718445, 'learning_rate': 5.529181335435124e-06, 'epoch': 1.54, 'num_input_tokens_seen': 2271568} +{'loss': 1.2021, 'grad_norm': 0.7945210337638855, 'learning_rate': 5.292288158031594e-06, 'epoch': 1.54, 'num_input_tokens_seen': 2278592} +{'loss': 1.0631, 'grad_norm': 0.8583483099937439, 'learning_rate': 5.060297685041659e-06, 'epoch': 1.55, 'num_input_tokens_seen': 2286304} +{'loss': 1.1668, 'grad_norm': 1.353039026260376, 'learning_rate': 4.833235356834959e-06, 'epoch': 1.55, 'num_input_tokens_seen': 2295040} +{'loss': 1.2205, 'grad_norm': 0.8499441146850586, 'learning_rate': 4.611126073354571e-06, 'epoch': 1.56, 'num_input_tokens_seen': 2304000} +{'loss': 1.1882, 'grad_norm': 1.1828577518463135, 'learning_rate': 4.3939941913863525e-06, 'epoch': 1.56, 'num_input_tokens_seen': 2310384} +{'loss': 1.2363, 'grad_norm': 0.8698590993881226, 'learning_rate': 4.181863521888019e-06, 'epoch': 1.57, 'num_input_tokens_seen': 2317008} +{'loss': 1.1589, 'grad_norm': 1.1912041902542114, 'learning_rate': 3.974757327377981e-06, 'epoch': 1.57, 'num_input_tokens_seen': 2324752} +{'loss': 1.0814, 'grad_norm': 1.479888677597046, 'learning_rate': 3.772698319384349e-06, 'epoch': 1.58, 'num_input_tokens_seen': 2332416} +{'loss': 1.2078, 'grad_norm': 0.7699785232543945, 'learning_rate': 3.575708655954324e-06, 'epoch': 1.58, 'num_input_tokens_seen': 2341936} +{'loss': 1.1234, 'grad_norm': 0.7506076097488403, 'learning_rate': 3.3838099392243916e-06, 'epoch': 1.59, 'num_input_tokens_seen': 2351968} +{'loss': 1.2396, 'grad_norm': 0.8103495836257935, 'learning_rate': 3.197023213051337e-06, 'epoch': 1.59, 'num_input_tokens_seen': 2361200} +{'loss': 1.2116, 'grad_norm': 0.8596040606498718, 'learning_rate': 3.0153689607045845e-06, 'epoch': 1.6, 'num_input_tokens_seen': 2367984} +{'loss': 1.1365, 'grad_norm': 1.0933467149734497, 'learning_rate': 2.8388671026199522e-06, 'epoch': 1.61, 'num_input_tokens_seen': 2375360} +{'loss': 1.2196, 'grad_norm': 0.8759250044822693, 'learning_rate': 2.667536994215186e-06, 'epoch': 1.61, 'num_input_tokens_seen': 2383344} +{'loss': 1.2588, 'grad_norm': 0.8788615465164185, 'learning_rate': 2.501397423767382e-06, 'epoch': 1.62, 'num_input_tokens_seen': 2390464} +{'loss': 1.195, 'grad_norm': 0.8420098423957825, 'learning_rate': 2.340466610352654e-06, 'epoch': 1.62, 'num_input_tokens_seen': 2398736} +{'loss': 1.2392, 'grad_norm': 0.8773792386054993, 'learning_rate': 2.1847622018482283e-06, 'epoch': 1.63, 'num_input_tokens_seen': 2406864} +{'loss': 1.104, 'grad_norm': 1.1017309427261353, 'learning_rate': 2.0343012729971243e-06, 'epoch': 1.63, 'num_input_tokens_seen': 2414304} +{'loss': 1.1601, 'grad_norm': 1.108170986175537, 'learning_rate': 1.8891003235357308e-06, 'epoch': 1.64, 'num_input_tokens_seen': 2421648} +{'loss': 1.2597, 'grad_norm': 0.9205071330070496, 'learning_rate': 1.7491752763844293e-06, 'epoch': 1.64, 'num_input_tokens_seen': 2429856} +{'loss': 1.0914, 'grad_norm': 0.7994181513786316, 'learning_rate': 1.6145414759014431e-06, 'epoch': 1.65, 'num_input_tokens_seen': 2437840} +{'loss': 1.1627, 'grad_norm': 0.8237050771713257, 'learning_rate': 1.4852136862001764e-06, 'epoch': 1.65, 'num_input_tokens_seen': 2446560} +{'loss': 1.1386, 'grad_norm': 1.0996540784835815, 'learning_rate': 1.3612060895301759e-06, 'epoch': 1.66, 'num_input_tokens_seen': 2453360} +{'loss': 1.29, 'grad_norm': 0.7453559637069702, 'learning_rate': 1.2425322847218368e-06, 'epoch': 1.66, 'num_input_tokens_seen': 2461600} +{'loss': 1.0164, 'grad_norm': 1.20688796043396, 'learning_rate': 1.1292052856952062e-06, 'epoch': 1.67, 'num_input_tokens_seen': 2469168} +{'loss': 1.0143, 'grad_norm': 0.9430545568466187, 'learning_rate': 1.0212375200327973e-06, 'epoch': 1.67, 'num_input_tokens_seen': 2476688} +{'loss': 1.1762, 'grad_norm': 0.8898412585258484, 'learning_rate': 9.186408276168013e-07, 'epoch': 1.68, 'num_input_tokens_seen': 2485136} +{'loss': 1.2691, 'grad_norm': 1.0093709230422974, 'learning_rate': 8.214264593307098e-07, 'epoch': 1.69, 'num_input_tokens_seen': 2491568} +{'loss': 1.3668, 'grad_norm': 1.4375550746917725, 'learning_rate': 7.296050758254957e-07, 'epoch': 1.69, 'num_input_tokens_seen': 2499312} +{'loss': 1.3101, 'grad_norm': 0.8531755208969116, 'learning_rate': 6.431867463506048e-07, 'epoch': 1.7, 'num_input_tokens_seen': 2506160} +{'loss': 1.3651, 'grad_norm': 0.8654898405075073, 'learning_rate': 5.621809476497098e-07, 'epoch': 1.7, 'num_input_tokens_seen': 2514000} +{'loss': 1.0573, 'grad_norm': 0.8185416460037231, 'learning_rate': 4.865965629214819e-07, 'epoch': 1.71, 'num_input_tokens_seen': 2520416} +{'loss': 1.3964, 'grad_norm': 0.7298229932785034, 'learning_rate': 4.1644188084548063e-07, 'epoch': 1.71, 'num_input_tokens_seen': 2528112} +{'loss': 1.1651, 'grad_norm': 0.8970651030540466, 'learning_rate': 3.517245946731529e-07, 'epoch': 1.72, 'num_input_tokens_seen': 2535360} +{'loss': 1.1533, 'grad_norm': 0.6972456574440002, 'learning_rate': 2.924518013842303e-07, 'epoch': 1.72, 'num_input_tokens_seen': 2543792} +{'loss': 1.2354, 'grad_norm': 0.727842390537262, 'learning_rate': 2.386300009084408e-07, 'epoch': 1.73, 'num_input_tokens_seen': 2553136} +{'loss': 1.1855, 'grad_norm': 0.913786768913269, 'learning_rate': 1.9026509541272275e-07, 'epoch': 1.73, 'num_input_tokens_seen': 2561840} +{'loss': 0.9797, 'grad_norm': 0.9313849210739136, 'learning_rate': 1.4736238865398765e-07, 'epoch': 1.74, 'num_input_tokens_seen': 2570032} +{'loss': 1.1833, 'grad_norm': 0.8113420009613037, 'learning_rate': 1.0992658539750178e-07, 'epoch': 1.74, 'num_input_tokens_seen': 2577888} +{'loss': 1.3613, 'grad_norm': 0.7111260294914246, 'learning_rate': 7.796179090094891e-08, 'epoch': 1.75, 'num_input_tokens_seen': 2586448} +{'loss': 1.1256, 'grad_norm': 0.909915566444397, 'learning_rate': 5.1471510464268236e-08, 'epoch': 1.75, 'num_input_tokens_seen': 2593744} +{'loss': 1.1567, 'grad_norm': 0.8057591915130615, 'learning_rate': 3.04586490452119e-08, 'epoch': 1.76, 'num_input_tokens_seen': 2602048} +{'loss': 1.302, 'grad_norm': 0.9321501851081848, 'learning_rate': 1.4925510940844156e-08, 'epoch': 1.77, 'num_input_tokens_seen': 2610160} +{'loss': 1.21, 'grad_norm': 1.0042476654052734, 'learning_rate': 4.873799534788059e-09, 'epoch': 1.77, 'num_input_tokens_seen': 2617616} +{'loss': 1.0737, 'grad_norm': 1.1939113140106201, 'learning_rate': 3.0461711048035415e-10, 'epoch': 1.78, 'num_input_tokens_seen': 2624416} +{'eval_loss': 1.258375644683838, 'eval_runtime': 22.1004, 'eval_samples_per_second': 45.248, 'eval_steps_per_second': 22.624, 'epoch': 1.78, 'num_input_tokens_seen': 2626832} +{'train_runtime': 1377.1674, 'train_samples_per_second': 11.618, 'train_steps_per_second': 0.726, 'train_loss': 1.2841649515628815, 'epoch': 1.78, 'num_input_tokens_seen': 2626832} +***** train metrics ***** + epoch = 1.7778 + num_input_tokens_seen = 2626832 + total_flos = 99172908GF + train_loss = 1.2842 + train_runtime = 0:22:57.16 + train_samples_per_second = 11.618 + train_steps_per_second = 0.726 +Figure saved at: ./results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/training_loss.png +Figure saved at: ./results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/training_eval_loss.png +08/21/2024 06:49:16 - WARNING - llamafactory.extras.ploting - No metric eval_accuracy to plot. +***** eval metrics ***** + epoch = 1.7778 + eval_loss = 1.2584 + eval_runtime = 0:00:21.70 + eval_samples_per_second = 46.073 + eval_steps_per_second = 23.037 + num_input_tokens_seen = 2626832 diff --git a/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/README.md b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/README.md new file mode 100644 index 00000000..8c45e3ac --- /dev/null +++ b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/README.md @@ -0,0 +1,66 @@ +--- +base_model: ../../llm/baichuan +library_name: peft +license: other +tags: +- llama-factory +- lora +- generated_from_trainer +model-index: +- name: Baichuan2_lora_sft_1_single + results: [] +--- + + + +# Baichuan2_lora_sft_1_single + +This model is a fine-tuned version of [../../llm/baichuan](https://huggingface.co/../../llm/baichuan) on the belle_1m dataset. +It achieves the following results on the evaluation set: +- Loss: 1.2584 +- Num Input Tokens Seen: 2626832 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 0.0001 +- train_batch_size: 2 +- eval_batch_size: 2 +- seed: 42 +- gradient_accumulation_steps: 8 +- total_train_batch_size: 16 +- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08 +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- training_steps: 1000 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | Input Tokens Seen | +|:-------------:|:------:|:----:|:---------------:|:-----------------:| +| 1.1711 | 0.8889 | 500 | 1.2699 | 1314896 | +| 1.0737 | 1.7778 | 1000 | 1.2584 | 2626832 | + + +### Framework versions + +- PEFT 0.12.0 +- Transformers 4.43.4 +- Pytorch 2.4.0+cu121 +- Datasets 2.20.0 +- Tokenizers 0.19.1 \ No newline at end of file diff --git a/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/adapter_config.json b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/adapter_config.json new file mode 100644 index 00000000..1dcc6962 --- /dev/null +++ b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/adapter_config.json @@ -0,0 +1,32 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../../llm/baichuan", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "gate_proj", + "o_proj", + "down_proj", + "W_pack" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/adapter_model.safetensors b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/adapter_model.safetensors new file mode 100644 index 00000000..1a7f499b Binary files /dev/null and b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/adapter_model.safetensors differ diff --git a/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/all_results.json b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/all_results.json new file mode 100644 index 00000000..f46fee4a --- /dev/null +++ b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/all_results.json @@ -0,0 +1,13 @@ +{ + "epoch": 1.7777777777777777, + "eval_loss": 1.258375644683838, + "eval_runtime": 21.7047, + "eval_samples_per_second": 46.073, + "eval_steps_per_second": 23.037, + "num_input_tokens_seen": 2626832, + "total_flos": 1.0648609969392845e+17, + "train_loss": 1.2841649515628815, + "train_runtime": 1377.1674, + "train_samples_per_second": 11.618, + "train_steps_per_second": 0.726 +} \ No newline at end of file diff --git a/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/eval_results.json b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/eval_results.json new file mode 100644 index 00000000..a3a5b07b --- /dev/null +++ b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/eval_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 1.7777777777777777, + "eval_loss": 1.258375644683838, + "eval_runtime": 21.7047, + "eval_samples_per_second": 46.073, + "eval_steps_per_second": 23.037, + "num_input_tokens_seen": 2626832 +} \ No newline at end of file diff --git a/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/special_tokens_map.json b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/special_tokens_map.json new file mode 100644 index 00000000..1e1a9978 --- /dev/null +++ b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/tokenization_baichuan.py b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/tokenization_baichuan.py new file mode 100644 index 00000000..8f95e59c --- /dev/null +++ b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/tokenization_baichuan.py @@ -0,0 +1,252 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm + +from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": {}, + "tokenizer_file": {}, +} +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} + + +class BaiChuanTokenizer(PreTrainedTokenizer): + """ + Construct a BaiChuan tokenizer. Based on byte-level Byte-Pair-Encoding. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + unk_token="", + bos_token="", + eos_token="", + pad_token=None, + sp_model_kwargs: Optional[Dict[str, Any]] = None, + add_bos_token=True, + add_eos_token=False, + clean_up_tokenization_spaces=False, + **kwargs, + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + self.vocab_file = vocab_file + self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + add_bos_token=add_bos_token, + add_eos_token=add_eos_token, + sp_model_kwargs=self.sp_model_kwargs, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + @property + def vocab_size(self): + """Returns vocab size""" + return self.sp_model.get_piece_size() + + def get_vocab(self): + """Returns vocab as a dict""" + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + """Returns a tokenized string.""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for i, token in enumerate(tokens): + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special and i != 0: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string + + def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + bos_token_id = [self.bos_token_id] if self.add_bos_token else [] + eos_token_id = [self.eos_token_id] if self.add_eos_token else [] + + output = bos_token_id + token_ids_0 + eos_token_id + + if token_ids_1 is not None: + output = output + bos_token_id + token_ids_1 + eos_token_id + + return output + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + bos_token_id = [1] if self.add_bos_token else [] + eos_token_id = [1] if self.add_eos_token else [] + + if token_ids_1 is None: + return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + return ( + bos_token_id + + ([0] * len(token_ids_0)) + + eos_token_id + + bos_token_id + + ([0] * len(token_ids_1)) + + eos_token_id + ) + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + if token_ids_1 is None, only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of ids. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + bos_token_id = [self.bos_token_id] if self.add_bos_token else [] + eos_token_id = [self.eos_token_id] if self.add_eos_token else [] + + output = [0] * len(bos_token_id + token_ids_0 + eos_token_id) + + if token_ids_1 is not None: + output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) + + return output \ No newline at end of file diff --git a/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/tokenizer.model b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/tokenizer.model new file mode 100644 index 00000000..7980f6b5 Binary files /dev/null and b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/tokenizer.model differ diff --git a/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/tokenizer_config.json b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/tokenizer_config.json new file mode 100644 index 00000000..748ca8ec --- /dev/null +++ b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/tokenizer_config.json @@ -0,0 +1,46 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "auto_map": { + "AutoTokenizer": [ + "tokenization_baichuan.BaiChuanTokenizer", + null + ] + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "split_special_tokens": false, + "tokenizer_class": "BaiChuanTokenizer", + "unk_token": "" +} diff --git a/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/train_results.json b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/train_results.json new file mode 100644 index 00000000..74404899 --- /dev/null +++ b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 1.7777777777777777, + "num_input_tokens_seen": 2626832, + "total_flos": 1.0648609969392845e+17, + "train_loss": 1.2841649515628815, + "train_runtime": 1377.1674, + "train_samples_per_second": 11.618, + "train_steps_per_second": 0.726 +} \ No newline at end of file diff --git a/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/trainer_log.jsonl b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/trainer_log.jsonl new file mode 100644 index 00000000..ad332dd8 --- /dev/null +++ b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/trainer_log.jsonl @@ -0,0 +1,336 @@ +{"current_steps": 3, "total_steps": 1000, "loss": 1.5586, "learning_rate": 3e-06, "epoch": 0.005333333333333333, "percentage": 0.3, "cur_time": "2024-08-21 06:26:23", "elapsed_time": "0:00:04", "remaining_time": "0:26:33", "throughput": "2068.67", "total_tokens": 9920} +{"current_steps": 6, "total_steps": 1000, "loss": 1.6295, "learning_rate": 6e-06, "epoch": 0.010666666666666666, "percentage": 0.6, "cur_time": "2024-08-21 06:26:28", "elapsed_time": "0:00:09", "remaining_time": "0:27:01", "throughput": "1979.61", "total_tokens": 19376} +{"current_steps": 9, "total_steps": 1000, "loss": 1.7438, "learning_rate": 9e-06, "epoch": 0.016, "percentage": 0.9, "cur_time": "2024-08-21 06:26:34", "elapsed_time": "0:00:15", "remaining_time": "0:28:25", "throughput": "1903.82", "total_tokens": 29488} +{"current_steps": 12, "total_steps": 1000, "loss": 1.8413, "learning_rate": 1.2e-05, "epoch": 0.021333333333333333, "percentage": 1.2, "cur_time": "2024-08-21 06:26:38", "elapsed_time": "0:00:19", "remaining_time": "0:27:19", "throughput": "1918.40", "total_tokens": 38208} +{"current_steps": 15, "total_steps": 1000, "loss": 1.5369, "learning_rate": 1.5e-05, "epoch": 0.02666666666666667, "percentage": 1.5, "cur_time": "2024-08-21 06:26:42", "elapsed_time": "0:00:23", "remaining_time": "0:26:04", "throughput": "1873.45", "total_tokens": 44624} +{"current_steps": 18, "total_steps": 1000, "loss": 1.77, "learning_rate": 1.8e-05, "epoch": 0.032, "percentage": 1.8, "cur_time": "2024-08-21 06:26:46", "elapsed_time": "0:00:28", "remaining_time": "0:25:28", "throughput": "1870.81", "total_tokens": 52416} +{"current_steps": 21, "total_steps": 1000, "loss": 1.6273, "learning_rate": 2.1e-05, "epoch": 0.037333333333333336, "percentage": 2.1, "cur_time": "2024-08-21 06:26:50", "elapsed_time": "0:00:31", "remaining_time": "0:24:44", "throughput": "1896.31", "total_tokens": 60368} +{"current_steps": 24, "total_steps": 1000, "loss": 1.9737, "learning_rate": 2.4e-05, "epoch": 0.042666666666666665, "percentage": 2.4, "cur_time": "2024-08-21 06:26:54", "elapsed_time": "0:00:35", "remaining_time": "0:24:01", "throughput": "1884.02", "total_tokens": 66784} +{"current_steps": 27, "total_steps": 1000, "loss": 1.6169, "learning_rate": 2.7000000000000002e-05, "epoch": 0.048, "percentage": 2.7, "cur_time": "2024-08-21 06:26:57", "elapsed_time": "0:00:39", "remaining_time": "0:23:29", "throughput": "1875.64", "total_tokens": 73344} +{"current_steps": 30, "total_steps": 1000, "loss": 1.8011, "learning_rate": 3e-05, "epoch": 0.05333333333333334, "percentage": 3.0, "cur_time": "2024-08-21 06:27:02", "elapsed_time": "0:00:43", "remaining_time": "0:23:18", "throughput": "1896.28", "total_tokens": 82032} +{"current_steps": 33, "total_steps": 1000, "loss": 1.773, "learning_rate": 3.3e-05, "epoch": 0.058666666666666666, "percentage": 3.3, "cur_time": "2024-08-21 06:27:05", "elapsed_time": "0:00:47", "remaining_time": "0:23:00", "throughput": "1906.21", "total_tokens": 89808} +{"current_steps": 36, "total_steps": 1000, "loss": 1.6426, "learning_rate": 3.6e-05, "epoch": 0.064, "percentage": 3.6, "cur_time": "2024-08-21 06:27:09", "elapsed_time": "0:00:50", "remaining_time": "0:22:38", "throughput": "1893.96", "total_tokens": 96080} +{"current_steps": 39, "total_steps": 1000, "loss": 1.548, "learning_rate": 3.9000000000000006e-05, "epoch": 0.06933333333333333, "percentage": 3.9, "cur_time": "2024-08-21 06:27:13", "elapsed_time": "0:00:54", "remaining_time": "0:22:21", "throughput": "1888.53", "total_tokens": 102784} +{"current_steps": 42, "total_steps": 1000, "loss": 1.5749, "learning_rate": 4.2e-05, "epoch": 0.07466666666666667, "percentage": 4.2, "cur_time": "2024-08-21 06:27:17", "elapsed_time": "0:00:58", "remaining_time": "0:22:13", "throughput": "1901.97", "total_tokens": 111184} +{"current_steps": 45, "total_steps": 1000, "loss": 1.7287, "learning_rate": 4.5e-05, "epoch": 0.08, "percentage": 4.5, "cur_time": "2024-08-21 06:27:21", "elapsed_time": "0:01:02", "remaining_time": "0:22:00", "throughput": "1877.30", "total_tokens": 116784} +{"current_steps": 48, "total_steps": 1000, "loss": 1.4529, "learning_rate": 4.8e-05, "epoch": 0.08533333333333333, "percentage": 4.8, "cur_time": "2024-08-21 06:27:25", "elapsed_time": "0:01:06", "remaining_time": "0:21:56", "throughput": "1890.32", "total_tokens": 125472} +{"current_steps": 51, "total_steps": 1000, "loss": 1.6277, "learning_rate": 5.1000000000000006e-05, "epoch": 0.09066666666666667, "percentage": 5.1, "cur_time": "2024-08-21 06:27:29", "elapsed_time": "0:01:10", "remaining_time": "0:21:48", "throughput": "1895.95", "total_tokens": 133360} +{"current_steps": 54, "total_steps": 1000, "loss": 1.6691, "learning_rate": 5.4000000000000005e-05, "epoch": 0.096, "percentage": 5.4, "cur_time": "2024-08-21 06:27:33", "elapsed_time": "0:01:14", "remaining_time": "0:21:40", "throughput": "1890.44", "total_tokens": 140336} +{"current_steps": 57, "total_steps": 1000, "loss": 1.3956, "learning_rate": 5.6999999999999996e-05, "epoch": 0.10133333333333333, "percentage": 5.7, "cur_time": "2024-08-21 06:27:37", "elapsed_time": "0:01:18", "remaining_time": "0:21:35", "throughput": "1892.09", "total_tokens": 148112} +{"current_steps": 60, "total_steps": 1000, "loss": 1.545, "learning_rate": 6e-05, "epoch": 0.10666666666666667, "percentage": 6.0, "cur_time": "2024-08-21 06:27:41", "elapsed_time": "0:01:22", "remaining_time": "0:21:30", "throughput": "1899.83", "total_tokens": 156480} +{"current_steps": 63, "total_steps": 1000, "loss": 1.5088, "learning_rate": 6.3e-05, "epoch": 0.112, "percentage": 6.3, "cur_time": "2024-08-21 06:27:45", "elapsed_time": "0:01:26", "remaining_time": "0:21:23", "throughput": "1901.25", "total_tokens": 164048} +{"current_steps": 66, "total_steps": 1000, "loss": 1.484, "learning_rate": 6.6e-05, "epoch": 0.11733333333333333, "percentage": 6.6, "cur_time": "2024-08-21 06:27:49", "elapsed_time": "0:01:30", "remaining_time": "0:21:18", "throughput": "1906.45", "total_tokens": 172192} +{"current_steps": 69, "total_steps": 1000, "loss": 1.651, "learning_rate": 6.9e-05, "epoch": 0.12266666666666666, "percentage": 6.9, "cur_time": "2024-08-21 06:27:53", "elapsed_time": "0:01:34", "remaining_time": "0:21:13", "throughput": "1912.47", "total_tokens": 180528} +{"current_steps": 72, "total_steps": 1000, "loss": 1.4958, "learning_rate": 7.2e-05, "epoch": 0.128, "percentage": 7.2, "cur_time": "2024-08-21 06:27:57", "elapsed_time": "0:01:38", "remaining_time": "0:21:07", "throughput": "1909.27", "total_tokens": 187728} +{"current_steps": 75, "total_steps": 1000, "loss": 1.499, "learning_rate": 7.500000000000001e-05, "epoch": 0.13333333333333333, "percentage": 7.5, "cur_time": "2024-08-21 06:28:01", "elapsed_time": "0:01:42", "remaining_time": "0:21:01", "throughput": "1902.66", "total_tokens": 194592} +{"current_steps": 78, "total_steps": 1000, "loss": 1.3138, "learning_rate": 7.800000000000001e-05, "epoch": 0.13866666666666666, "percentage": 7.8, "cur_time": "2024-08-21 06:28:04", "elapsed_time": "0:01:46", "remaining_time": "0:20:54", "throughput": "1907.91", "total_tokens": 202416} +{"current_steps": 81, "total_steps": 1000, "loss": 1.2772, "learning_rate": 8.1e-05, "epoch": 0.144, "percentage": 8.1, "cur_time": "2024-08-21 06:28:09", "elapsed_time": "0:01:50", "remaining_time": "0:20:50", "throughput": "1917.40", "total_tokens": 211328} +{"current_steps": 84, "total_steps": 1000, "loss": 1.4917, "learning_rate": 8.4e-05, "epoch": 0.14933333333333335, "percentage": 8.4, "cur_time": "2024-08-21 06:28:12", "elapsed_time": "0:01:54", "remaining_time": "0:20:44", "throughput": "1915.30", "total_tokens": 218544} +{"current_steps": 87, "total_steps": 1000, "loss": 1.4778, "learning_rate": 8.7e-05, "epoch": 0.15466666666666667, "percentage": 8.7, "cur_time": "2024-08-21 06:28:16", "elapsed_time": "0:01:58", "remaining_time": "0:20:39", "throughput": "1927.10", "total_tokens": 227632} +{"current_steps": 90, "total_steps": 1000, "loss": 1.2799, "learning_rate": 9e-05, "epoch": 0.16, "percentage": 9.0, "cur_time": "2024-08-21 06:28:20", "elapsed_time": "0:02:02", "remaining_time": "0:20:34", "throughput": "1928.89", "total_tokens": 235440} +{"current_steps": 93, "total_steps": 1000, "loss": 1.4626, "learning_rate": 9.300000000000001e-05, "epoch": 0.16533333333333333, "percentage": 9.3, "cur_time": "2024-08-21 06:28:24", "elapsed_time": "0:02:06", "remaining_time": "0:20:29", "throughput": "1935.82", "total_tokens": 244032} +{"current_steps": 96, "total_steps": 1000, "loss": 1.5386, "learning_rate": 9.6e-05, "epoch": 0.17066666666666666, "percentage": 9.6, "cur_time": "2024-08-21 06:28:28", "elapsed_time": "0:02:09", "remaining_time": "0:20:22", "throughput": "1929.55", "total_tokens": 250480} +{"current_steps": 99, "total_steps": 1000, "loss": 1.4591, "learning_rate": 9.900000000000001e-05, "epoch": 0.176, "percentage": 9.9, "cur_time": "2024-08-21 06:28:32", "elapsed_time": "0:02:14", "remaining_time": "0:20:19", "throughput": "1937.02", "total_tokens": 259600} +{"current_steps": 102, "total_steps": 1000, "loss": 1.3403, "learning_rate": 9.999878153526974e-05, "epoch": 0.18133333333333335, "percentage": 10.2, "cur_time": "2024-08-21 06:28:36", "elapsed_time": "0:02:18", "remaining_time": "0:20:16", "throughput": "1934.04", "total_tokens": 267216} +{"current_steps": 105, "total_steps": 1000, "loss": 1.2782, "learning_rate": 9.999238475781957e-05, "epoch": 0.18666666666666668, "percentage": 10.5, "cur_time": "2024-08-21 06:28:40", "elapsed_time": "0:02:22", "remaining_time": "0:20:11", "throughput": "1931.97", "total_tokens": 274624} +{"current_steps": 108, "total_steps": 1000, "loss": 1.28, "learning_rate": 9.998050575201771e-05, "epoch": 0.192, "percentage": 10.8, "cur_time": "2024-08-21 06:28:44", "elapsed_time": "0:02:25", "remaining_time": "0:20:05", "throughput": "1924.61", "total_tokens": 280832} +{"current_steps": 111, "total_steps": 1000, "loss": 1.476, "learning_rate": 9.996314582053106e-05, "epoch": 0.19733333333333333, "percentage": 11.1, "cur_time": "2024-08-21 06:28:48", "elapsed_time": "0:02:29", "remaining_time": "0:20:00", "throughput": "1924.75", "total_tokens": 288544} +{"current_steps": 114, "total_steps": 1000, "loss": 1.4419, "learning_rate": 9.99403068670717e-05, "epoch": 0.20266666666666666, "percentage": 11.4, "cur_time": "2024-08-21 06:28:52", "elapsed_time": "0:02:34", "remaining_time": "0:19:57", "throughput": "1925.15", "total_tokens": 296560} +{"current_steps": 117, "total_steps": 1000, "loss": 1.3927, "learning_rate": 9.991199139618827e-05, "epoch": 0.208, "percentage": 11.7, "cur_time": "2024-08-21 06:28:56", "elapsed_time": "0:02:37", "remaining_time": "0:19:51", "throughput": "1923.29", "total_tokens": 303600} +{"current_steps": 120, "total_steps": 1000, "loss": 1.3418, "learning_rate": 9.987820251299122e-05, "epoch": 0.21333333333333335, "percentage": 12.0, "cur_time": "2024-08-21 06:29:00", "elapsed_time": "0:02:41", "remaining_time": "0:19:45", "throughput": "1922.53", "total_tokens": 310800} +{"current_steps": 123, "total_steps": 1000, "loss": 1.2896, "learning_rate": 9.983894392281237e-05, "epoch": 0.21866666666666668, "percentage": 12.3, "cur_time": "2024-08-21 06:29:04", "elapsed_time": "0:02:45", "remaining_time": "0:19:40", "throughput": "1920.99", "total_tokens": 317936} +{"current_steps": 126, "total_steps": 1000, "loss": 1.4422, "learning_rate": 9.979421993079852e-05, "epoch": 0.224, "percentage": 12.6, "cur_time": "2024-08-21 06:29:08", "elapsed_time": "0:02:49", "remaining_time": "0:19:34", "throughput": "1925.15", "total_tokens": 325936} +{"current_steps": 129, "total_steps": 1000, "loss": 1.319, "learning_rate": 9.974403544143941e-05, "epoch": 0.22933333333333333, "percentage": 12.9, "cur_time": "2024-08-21 06:29:12", "elapsed_time": "0:02:53", "remaining_time": "0:19:31", "throughput": "1928.29", "total_tokens": 334592} +{"current_steps": 132, "total_steps": 1000, "loss": 1.1798, "learning_rate": 9.968839595802982e-05, "epoch": 0.23466666666666666, "percentage": 13.2, "cur_time": "2024-08-21 06:29:16", "elapsed_time": "0:02:57", "remaining_time": "0:19:25", "throughput": "1924.33", "total_tokens": 341200} +{"current_steps": 135, "total_steps": 1000, "loss": 1.2938, "learning_rate": 9.962730758206611e-05, "epoch": 0.24, "percentage": 13.5, "cur_time": "2024-08-21 06:29:19", "elapsed_time": "0:03:01", "remaining_time": "0:19:19", "throughput": "1921.99", "total_tokens": 347888} +{"current_steps": 138, "total_steps": 1000, "loss": 1.4067, "learning_rate": 9.956077701257709e-05, "epoch": 0.24533333333333332, "percentage": 13.8, "cur_time": "2024-08-21 06:29:23", "elapsed_time": "0:03:05", "remaining_time": "0:19:16", "throughput": "1927.60", "total_tokens": 356864} +{"current_steps": 141, "total_steps": 1000, "loss": 1.3325, "learning_rate": 9.948881154538945e-05, "epoch": 0.25066666666666665, "percentage": 14.1, "cur_time": "2024-08-21 06:29:28", "elapsed_time": "0:03:09", "remaining_time": "0:19:12", "throughput": "1931.38", "total_tokens": 365440} +{"current_steps": 144, "total_steps": 1000, "loss": 1.3159, "learning_rate": 9.941141907232765e-05, "epoch": 0.256, "percentage": 14.4, "cur_time": "2024-08-21 06:29:31", "elapsed_time": "0:03:13", "remaining_time": "0:19:08", "throughput": "1932.82", "total_tokens": 373312} +{"current_steps": 147, "total_steps": 1000, "loss": 1.4941, "learning_rate": 9.932860808034848e-05, "epoch": 0.2613333333333333, "percentage": 14.7, "cur_time": "2024-08-21 06:29:36", "elapsed_time": "0:03:17", "remaining_time": "0:19:06", "throughput": "1942.07", "total_tokens": 383552} +{"current_steps": 150, "total_steps": 1000, "loss": 1.2139, "learning_rate": 9.924038765061042e-05, "epoch": 0.26666666666666666, "percentage": 15.0, "cur_time": "2024-08-21 06:29:40", "elapsed_time": "0:03:21", "remaining_time": "0:19:00", "throughput": "1941.99", "total_tokens": 390928} +{"current_steps": 153, "total_steps": 1000, "loss": 1.3313, "learning_rate": 9.914676745747772e-05, "epoch": 0.272, "percentage": 15.3, "cur_time": "2024-08-21 06:29:44", "elapsed_time": "0:03:25", "remaining_time": "0:18:56", "throughput": "1940.12", "total_tokens": 398224} +{"current_steps": 156, "total_steps": 1000, "loss": 1.2673, "learning_rate": 9.904775776745958e-05, "epoch": 0.2773333333333333, "percentage": 15.6, "cur_time": "2024-08-21 06:29:48", "elapsed_time": "0:03:29", "remaining_time": "0:18:52", "throughput": "1943.26", "total_tokens": 406656} +{"current_steps": 159, "total_steps": 1000, "loss": 1.357, "learning_rate": 9.894336943808426e-05, "epoch": 0.2826666666666667, "percentage": 15.9, "cur_time": "2024-08-21 06:29:51", "elapsed_time": "0:03:33", "remaining_time": "0:18:47", "throughput": "1940.75", "total_tokens": 413584} +{"current_steps": 162, "total_steps": 1000, "loss": 1.2238, "learning_rate": 9.88336139167084e-05, "epoch": 0.288, "percentage": 16.2, "cur_time": "2024-08-21 06:29:55", "elapsed_time": "0:03:36", "remaining_time": "0:18:41", "throughput": "1938.23", "total_tokens": 420240} +{"current_steps": 165, "total_steps": 1000, "loss": 1.2388, "learning_rate": 9.871850323926177e-05, "epoch": 0.29333333333333333, "percentage": 16.5, "cur_time": "2024-08-21 06:29:59", "elapsed_time": "0:03:40", "remaining_time": "0:18:35", "throughput": "1934.91", "total_tokens": 426608} +{"current_steps": 168, "total_steps": 1000, "loss": 1.3229, "learning_rate": 9.859805002892732e-05, "epoch": 0.2986666666666667, "percentage": 16.8, "cur_time": "2024-08-21 06:30:03", "elapsed_time": "0:03:44", "remaining_time": "0:18:31", "throughput": "1935.33", "total_tokens": 434224} +{"current_steps": 171, "total_steps": 1000, "loss": 1.387, "learning_rate": 9.847226749475695e-05, "epoch": 0.304, "percentage": 17.1, "cur_time": "2024-08-21 06:30:07", "elapsed_time": "0:03:48", "remaining_time": "0:18:28", "throughput": "1941.14", "total_tokens": 443808} +{"current_steps": 174, "total_steps": 1000, "loss": 1.5095, "learning_rate": 9.834116943022298e-05, "epoch": 0.30933333333333335, "percentage": 17.4, "cur_time": "2024-08-21 06:30:11", "elapsed_time": "0:03:52", "remaining_time": "0:18:24", "throughput": "1943.39", "total_tokens": 452080} +{"current_steps": 177, "total_steps": 1000, "loss": 1.2474, "learning_rate": 9.820477021170551e-05, "epoch": 0.31466666666666665, "percentage": 17.7, "cur_time": "2024-08-21 06:30:15", "elapsed_time": "0:03:56", "remaining_time": "0:18:20", "throughput": "1945.08", "total_tokens": 460528} +{"current_steps": 180, "total_steps": 1000, "loss": 1.2774, "learning_rate": 9.806308479691595e-05, "epoch": 0.32, "percentage": 18.0, "cur_time": "2024-08-21 06:30:19", "elapsed_time": "0:04:01", "remaining_time": "0:18:17", "throughput": "1949.48", "total_tokens": 469856} +{"current_steps": 183, "total_steps": 1000, "loss": 1.2836, "learning_rate": 9.791612872325667e-05, "epoch": 0.3253333333333333, "percentage": 18.3, "cur_time": "2024-08-21 06:30:23", "elapsed_time": "0:04:04", "remaining_time": "0:18:12", "throughput": "1949.35", "total_tokens": 477216} +{"current_steps": 186, "total_steps": 1000, "loss": 1.4973, "learning_rate": 9.776391810611718e-05, "epoch": 0.33066666666666666, "percentage": 18.6, "cur_time": "2024-08-21 06:30:27", "elapsed_time": "0:04:08", "remaining_time": "0:18:08", "throughput": "1951.53", "total_tokens": 485392} +{"current_steps": 189, "total_steps": 1000, "loss": 1.2715, "learning_rate": 9.760646963710694e-05, "epoch": 0.336, "percentage": 18.9, "cur_time": "2024-08-21 06:30:31", "elapsed_time": "0:04:12", "remaining_time": "0:18:04", "throughput": "1950.65", "total_tokens": 492832} +{"current_steps": 192, "total_steps": 1000, "loss": 1.303, "learning_rate": 9.744380058222483e-05, "epoch": 0.3413333333333333, "percentage": 19.2, "cur_time": "2024-08-21 06:30:35", "elapsed_time": "0:04:16", "remaining_time": "0:18:00", "throughput": "1948.50", "total_tokens": 500112} +{"current_steps": 195, "total_steps": 1000, "loss": 1.3055, "learning_rate": 9.727592877996585e-05, "epoch": 0.3466666666666667, "percentage": 19.5, "cur_time": "2024-08-21 06:30:39", "elapsed_time": "0:04:20", "remaining_time": "0:17:56", "throughput": "1950.35", "total_tokens": 508384} +{"current_steps": 198, "total_steps": 1000, "loss": 1.551, "learning_rate": 9.710287263936484e-05, "epoch": 0.352, "percentage": 19.8, "cur_time": "2024-08-21 06:30:43", "elapsed_time": "0:04:24", "remaining_time": "0:17:51", "throughput": "1952.16", "total_tokens": 516576} +{"current_steps": 201, "total_steps": 1000, "loss": 1.4759, "learning_rate": 9.69246511379778e-05, "epoch": 0.35733333333333334, "percentage": 20.1, "cur_time": "2024-08-21 06:30:47", "elapsed_time": "0:04:28", "remaining_time": "0:17:48", "throughput": "1948.16", "total_tokens": 523456} +{"current_steps": 204, "total_steps": 1000, "loss": 1.358, "learning_rate": 9.674128381980072e-05, "epoch": 0.3626666666666667, "percentage": 20.4, "cur_time": "2024-08-21 06:30:51", "elapsed_time": "0:04:32", "remaining_time": "0:17:43", "throughput": "1948.20", "total_tokens": 530768} +{"current_steps": 207, "total_steps": 1000, "loss": 1.2757, "learning_rate": 9.655279079312642e-05, "epoch": 0.368, "percentage": 20.7, "cur_time": "2024-08-21 06:30:55", "elapsed_time": "0:04:36", "remaining_time": "0:17:38", "throughput": "1948.52", "total_tokens": 538288} +{"current_steps": 210, "total_steps": 1000, "loss": 1.2448, "learning_rate": 9.635919272833938e-05, "epoch": 0.37333333333333335, "percentage": 21.0, "cur_time": "2024-08-21 06:30:59", "elapsed_time": "0:04:40", "remaining_time": "0:17:34", "throughput": "1950.77", "total_tokens": 546816} +{"current_steps": 213, "total_steps": 1000, "loss": 1.3256, "learning_rate": 9.616051085564906e-05, "epoch": 0.37866666666666665, "percentage": 21.3, "cur_time": "2024-08-21 06:31:03", "elapsed_time": "0:04:44", "remaining_time": "0:17:30", "throughput": "1950.72", "total_tokens": 554800} +{"current_steps": 216, "total_steps": 1000, "loss": 1.254, "learning_rate": 9.595676696276172e-05, "epoch": 0.384, "percentage": 21.6, "cur_time": "2024-08-21 06:31:06", "elapsed_time": "0:04:48", "remaining_time": "0:17:25", "throughput": "1945.59", "total_tokens": 560368} +{"current_steps": 219, "total_steps": 1000, "loss": 1.3174, "learning_rate": 9.574798339249125e-05, "epoch": 0.3893333333333333, "percentage": 21.9, "cur_time": "2024-08-21 06:31:10", "elapsed_time": "0:04:51", "remaining_time": "0:17:21", "throughput": "1943.56", "total_tokens": 567504} +{"current_steps": 222, "total_steps": 1000, "loss": 1.1809, "learning_rate": 9.553418304030886e-05, "epoch": 0.39466666666666667, "percentage": 22.2, "cur_time": "2024-08-21 06:31:14", "elapsed_time": "0:04:56", "remaining_time": "0:17:17", "throughput": "1943.20", "total_tokens": 575440} +{"current_steps": 225, "total_steps": 1000, "loss": 1.3219, "learning_rate": 9.53153893518325e-05, "epoch": 0.4, "percentage": 22.5, "cur_time": "2024-08-21 06:31:19", "elapsed_time": "0:05:00", "remaining_time": "0:17:14", "throughput": "1941.96", "total_tokens": 583024} +{"current_steps": 228, "total_steps": 1000, "loss": 1.4601, "learning_rate": 9.50916263202557e-05, "epoch": 0.4053333333333333, "percentage": 22.8, "cur_time": "2024-08-21 06:31:23", "elapsed_time": "0:05:04", "remaining_time": "0:17:10", "throughput": "1944.53", "total_tokens": 591840} +{"current_steps": 231, "total_steps": 1000, "loss": 1.3049, "learning_rate": 9.486291848371643e-05, "epoch": 0.4106666666666667, "percentage": 23.1, "cur_time": "2024-08-21 06:31:27", "elapsed_time": "0:05:08", "remaining_time": "0:17:06", "throughput": "1944.29", "total_tokens": 599392} +{"current_steps": 234, "total_steps": 1000, "loss": 1.4101, "learning_rate": 9.462929092260628e-05, "epoch": 0.416, "percentage": 23.4, "cur_time": "2024-08-21 06:31:31", "elapsed_time": "0:05:12", "remaining_time": "0:17:02", "throughput": "1941.92", "total_tokens": 606560} +{"current_steps": 237, "total_steps": 1000, "loss": 1.1738, "learning_rate": 9.439076925682006e-05, "epoch": 0.42133333333333334, "percentage": 23.7, "cur_time": "2024-08-21 06:31:35", "elapsed_time": "0:05:16", "remaining_time": "0:16:58", "throughput": "1943.90", "total_tokens": 615216} +{"current_steps": 240, "total_steps": 1000, "loss": 1.3717, "learning_rate": 9.414737964294636e-05, "epoch": 0.4266666666666667, "percentage": 24.0, "cur_time": "2024-08-21 06:31:39", "elapsed_time": "0:05:20", "remaining_time": "0:16:54", "throughput": "1946.10", "total_tokens": 623696} +{"current_steps": 243, "total_steps": 1000, "loss": 1.3903, "learning_rate": 9.389914877139903e-05, "epoch": 0.432, "percentage": 24.3, "cur_time": "2024-08-21 06:31:43", "elapsed_time": "0:05:24", "remaining_time": "0:16:50", "throughput": "1946.75", "total_tokens": 631568} +{"current_steps": 246, "total_steps": 1000, "loss": 1.3946, "learning_rate": 9.364610386349049e-05, "epoch": 0.43733333333333335, "percentage": 24.6, "cur_time": "2024-08-21 06:31:47", "elapsed_time": "0:05:28", "remaining_time": "0:16:46", "throughput": "1948.16", "total_tokens": 639424} +{"current_steps": 249, "total_steps": 1000, "loss": 1.22, "learning_rate": 9.338827266844644e-05, "epoch": 0.44266666666666665, "percentage": 24.9, "cur_time": "2024-08-21 06:31:50", "elapsed_time": "0:05:32", "remaining_time": "0:16:41", "throughput": "1949.77", "total_tokens": 647696} +{"current_steps": 252, "total_steps": 1000, "loss": 1.3776, "learning_rate": 9.312568346036288e-05, "epoch": 0.448, "percentage": 25.2, "cur_time": "2024-08-21 06:31:54", "elapsed_time": "0:05:36", "remaining_time": "0:16:37", "throughput": "1950.75", "total_tokens": 655600} +{"current_steps": 255, "total_steps": 1000, "loss": 1.3568, "learning_rate": 9.285836503510562e-05, "epoch": 0.4533333333333333, "percentage": 25.5, "cur_time": "2024-08-21 06:31:58", "elapsed_time": "0:05:40", "remaining_time": "0:16:33", "throughput": "1950.45", "total_tokens": 663200} +{"current_steps": 258, "total_steps": 1000, "loss": 1.2346, "learning_rate": 9.258634670715238e-05, "epoch": 0.45866666666666667, "percentage": 25.8, "cur_time": "2024-08-21 06:32:02", "elapsed_time": "0:05:43", "remaining_time": "0:16:29", "throughput": "1948.83", "total_tokens": 670256} +{"current_steps": 261, "total_steps": 1000, "loss": 1.4332, "learning_rate": 9.230965830637821e-05, "epoch": 0.464, "percentage": 26.1, "cur_time": "2024-08-21 06:32:06", "elapsed_time": "0:05:47", "remaining_time": "0:16:24", "throughput": "1950.00", "total_tokens": 678304} +{"current_steps": 264, "total_steps": 1000, "loss": 1.3245, "learning_rate": 9.202833017478422e-05, "epoch": 0.4693333333333333, "percentage": 26.4, "cur_time": "2024-08-21 06:32:10", "elapsed_time": "0:05:52", "remaining_time": "0:16:21", "throughput": "1952.73", "total_tokens": 687392} +{"current_steps": 267, "total_steps": 1000, "loss": 1.3583, "learning_rate": 9.174239316317033e-05, "epoch": 0.4746666666666667, "percentage": 26.7, "cur_time": "2024-08-21 06:32:14", "elapsed_time": "0:05:56", "remaining_time": "0:16:17", "throughput": "1953.21", "total_tokens": 695344} +{"current_steps": 270, "total_steps": 1000, "loss": 1.271, "learning_rate": 9.145187862775209e-05, "epoch": 0.48, "percentage": 27.0, "cur_time": "2024-08-21 06:32:19", "elapsed_time": "0:06:00", "remaining_time": "0:16:14", "throughput": "1955.94", "total_tokens": 704736} +{"current_steps": 273, "total_steps": 1000, "loss": 1.207, "learning_rate": 9.11568184267221e-05, "epoch": 0.48533333333333334, "percentage": 27.3, "cur_time": "2024-08-21 06:32:22", "elapsed_time": "0:06:04", "remaining_time": "0:16:09", "throughput": "1952.45", "total_tokens": 710720} +{"current_steps": 276, "total_steps": 1000, "loss": 1.1755, "learning_rate": 9.085724491675642e-05, "epoch": 0.49066666666666664, "percentage": 27.6, "cur_time": "2024-08-21 06:32:26", "elapsed_time": "0:06:07", "remaining_time": "0:16:05", "throughput": "1951.85", "total_tokens": 718128} +{"current_steps": 279, "total_steps": 1000, "loss": 1.2883, "learning_rate": 9.055319094946633e-05, "epoch": 0.496, "percentage": 27.9, "cur_time": "2024-08-21 06:32:30", "elapsed_time": "0:06:11", "remaining_time": "0:16:01", "throughput": "1951.12", "total_tokens": 725568} +{"current_steps": 282, "total_steps": 1000, "loss": 1.314, "learning_rate": 9.02446898677957e-05, "epoch": 0.5013333333333333, "percentage": 28.2, "cur_time": "2024-08-21 06:32:34", "elapsed_time": "0:06:15", "remaining_time": "0:15:56", "throughput": "1950.72", "total_tokens": 732928} +{"current_steps": 285, "total_steps": 1000, "loss": 1.2864, "learning_rate": 8.993177550236464e-05, "epoch": 0.5066666666666667, "percentage": 28.5, "cur_time": "2024-08-21 06:32:38", "elapsed_time": "0:06:19", "remaining_time": "0:15:52", "throughput": "1952.11", "total_tokens": 741536} +{"current_steps": 288, "total_steps": 1000, "loss": 1.227, "learning_rate": 8.961448216775954e-05, "epoch": 0.512, "percentage": 28.8, "cur_time": "2024-08-21 06:32:42", "elapsed_time": "0:06:23", "remaining_time": "0:15:48", "throughput": "1950.39", "total_tokens": 748288} +{"current_steps": 291, "total_steps": 1000, "loss": 1.3016, "learning_rate": 8.92928446587701e-05, "epoch": 0.5173333333333333, "percentage": 29.1, "cur_time": "2024-08-21 06:32:46", "elapsed_time": "0:06:27", "remaining_time": "0:15:44", "throughput": "1952.90", "total_tokens": 757216} +{"current_steps": 294, "total_steps": 1000, "loss": 1.2786, "learning_rate": 8.896689824657372e-05, "epoch": 0.5226666666666666, "percentage": 29.4, "cur_time": "2024-08-21 06:32:50", "elapsed_time": "0:06:31", "remaining_time": "0:15:40", "throughput": "1951.59", "total_tokens": 764240} +{"current_steps": 297, "total_steps": 1000, "loss": 1.3358, "learning_rate": 8.863667867486756e-05, "epoch": 0.528, "percentage": 29.7, "cur_time": "2024-08-21 06:32:54", "elapsed_time": "0:06:36", "remaining_time": "0:15:37", "throughput": "1954.88", "total_tokens": 774416} +{"current_steps": 300, "total_steps": 1000, "loss": 1.3911, "learning_rate": 8.83022221559489e-05, "epoch": 0.5333333333333333, "percentage": 30.0, "cur_time": "2024-08-21 06:32:59", "elapsed_time": "0:06:40", "remaining_time": "0:15:34", "throughput": "1955.22", "total_tokens": 782656} +{"current_steps": 303, "total_steps": 1000, "loss": 1.193, "learning_rate": 8.796356536674403e-05, "epoch": 0.5386666666666666, "percentage": 30.3, "cur_time": "2024-08-21 06:33:03", "elapsed_time": "0:06:44", "remaining_time": "0:15:30", "throughput": "1953.12", "total_tokens": 790032} +{"current_steps": 306, "total_steps": 1000, "loss": 1.2034, "learning_rate": 8.762074544478623e-05, "epoch": 0.544, "percentage": 30.6, "cur_time": "2024-08-21 06:33:07", "elapsed_time": "0:06:48", "remaining_time": "0:15:26", "throughput": "1953.60", "total_tokens": 798048} +{"current_steps": 309, "total_steps": 1000, "loss": 1.1278, "learning_rate": 8.727379998414311e-05, "epoch": 0.5493333333333333, "percentage": 30.9, "cur_time": "2024-08-21 06:33:11", "elapsed_time": "0:06:52", "remaining_time": "0:15:22", "throughput": "1953.87", "total_tokens": 805792} +{"current_steps": 312, "total_steps": 1000, "loss": 1.3952, "learning_rate": 8.692276703129421e-05, "epoch": 0.5546666666666666, "percentage": 31.2, "cur_time": "2024-08-21 06:33:15", "elapsed_time": "0:06:56", "remaining_time": "0:15:17", "throughput": "1953.86", "total_tokens": 813280} +{"current_steps": 315, "total_steps": 1000, "loss": 1.3797, "learning_rate": 8.656768508095853e-05, "epoch": 0.56, "percentage": 31.5, "cur_time": "2024-08-21 06:33:19", "elapsed_time": "0:07:00", "remaining_time": "0:15:13", "throughput": "1956.93", "total_tokens": 822464} +{"current_steps": 318, "total_steps": 1000, "loss": 1.311, "learning_rate": 8.620859307187339e-05, "epoch": 0.5653333333333334, "percentage": 31.8, "cur_time": "2024-08-21 06:33:23", "elapsed_time": "0:07:04", "remaining_time": "0:15:10", "throughput": "1960.90", "total_tokens": 832672} +{"current_steps": 321, "total_steps": 1000, "loss": 1.3417, "learning_rate": 8.584553038252414e-05, "epoch": 0.5706666666666667, "percentage": 32.1, "cur_time": "2024-08-21 06:33:27", "elapsed_time": "0:07:08", "remaining_time": "0:15:06", "throughput": "1962.32", "total_tokens": 841248} +{"current_steps": 324, "total_steps": 1000, "loss": 1.2492, "learning_rate": 8.547853682682604e-05, "epoch": 0.576, "percentage": 32.4, "cur_time": "2024-08-21 06:33:31", "elapsed_time": "0:07:12", "remaining_time": "0:15:02", "throughput": "1961.46", "total_tokens": 848432} +{"current_steps": 327, "total_steps": 1000, "loss": 1.4109, "learning_rate": 8.510765264975813e-05, "epoch": 0.5813333333333334, "percentage": 32.7, "cur_time": "2024-08-21 06:33:35", "elapsed_time": "0:07:16", "remaining_time": "0:14:59", "throughput": "1964.03", "total_tokens": 858096} +{"current_steps": 330, "total_steps": 1000, "loss": 1.3382, "learning_rate": 8.473291852294987e-05, "epoch": 0.5866666666666667, "percentage": 33.0, "cur_time": "2024-08-21 06:33:39", "elapsed_time": "0:07:20", "remaining_time": "0:14:55", "throughput": "1965.90", "total_tokens": 866784} +{"current_steps": 333, "total_steps": 1000, "loss": 1.2248, "learning_rate": 8.435437554022115e-05, "epoch": 0.592, "percentage": 33.3, "cur_time": "2024-08-21 06:33:43", "elapsed_time": "0:07:24", "remaining_time": "0:14:50", "throughput": "1965.02", "total_tokens": 873904} +{"current_steps": 336, "total_steps": 1000, "loss": 1.1435, "learning_rate": 8.397206521307584e-05, "epoch": 0.5973333333333334, "percentage": 33.6, "cur_time": "2024-08-21 06:33:47", "elapsed_time": "0:07:28", "remaining_time": "0:14:46", "throughput": "1962.16", "total_tokens": 879856} +{"current_steps": 339, "total_steps": 1000, "loss": 1.1193, "learning_rate": 8.358602946614951e-05, "epoch": 0.6026666666666667, "percentage": 33.9, "cur_time": "2024-08-21 06:33:51", "elapsed_time": "0:07:32", "remaining_time": "0:14:41", "throughput": "1961.25", "total_tokens": 887152} +{"current_steps": 342, "total_steps": 1000, "loss": 1.2794, "learning_rate": 8.319631063261209e-05, "epoch": 0.608, "percentage": 34.2, "cur_time": "2024-08-21 06:33:55", "elapsed_time": "0:07:36", "remaining_time": "0:14:39", "throughput": "1962.75", "total_tokens": 896752} +{"current_steps": 345, "total_steps": 1000, "loss": 1.2186, "learning_rate": 8.280295144952536e-05, "epoch": 0.6133333333333333, "percentage": 34.5, "cur_time": "2024-08-21 06:33:59", "elapsed_time": "0:07:41", "remaining_time": "0:14:35", "throughput": "1964.05", "total_tokens": 905696} +{"current_steps": 348, "total_steps": 1000, "loss": 1.2683, "learning_rate": 8.240599505315655e-05, "epoch": 0.6186666666666667, "percentage": 34.8, "cur_time": "2024-08-21 06:34:03", "elapsed_time": "0:07:45", "remaining_time": "0:14:31", "throughput": "1964.13", "total_tokens": 913536} +{"current_steps": 351, "total_steps": 1000, "loss": 1.1247, "learning_rate": 8.200548497424778e-05, "epoch": 0.624, "percentage": 35.1, "cur_time": "2024-08-21 06:34:07", "elapsed_time": "0:07:48", "remaining_time": "0:14:26", "throughput": "1961.76", "total_tokens": 919584} +{"current_steps": 354, "total_steps": 1000, "loss": 1.4098, "learning_rate": 8.160146513324254e-05, "epoch": 0.6293333333333333, "percentage": 35.4, "cur_time": "2024-08-21 06:34:11", "elapsed_time": "0:07:52", "remaining_time": "0:14:22", "throughput": "1961.81", "total_tokens": 927248} +{"current_steps": 357, "total_steps": 1000, "loss": 1.2803, "learning_rate": 8.119397983546932e-05, "epoch": 0.6346666666666667, "percentage": 35.7, "cur_time": "2024-08-21 06:34:15", "elapsed_time": "0:07:57", "remaining_time": "0:14:19", "throughput": "1964.83", "total_tokens": 937328} +{"current_steps": 360, "total_steps": 1000, "loss": 1.4474, "learning_rate": 8.07830737662829e-05, "epoch": 0.64, "percentage": 36.0, "cur_time": "2024-08-21 06:34:19", "elapsed_time": "0:08:01", "remaining_time": "0:14:15", "throughput": "1966.78", "total_tokens": 946400} +{"current_steps": 363, "total_steps": 1000, "loss": 1.3626, "learning_rate": 8.036879198616434e-05, "epoch": 0.6453333333333333, "percentage": 36.3, "cur_time": "2024-08-21 06:34:24", "elapsed_time": "0:08:05", "remaining_time": "0:14:11", "throughput": "1967.55", "total_tokens": 954912} +{"current_steps": 366, "total_steps": 1000, "loss": 1.2167, "learning_rate": 7.99511799257793e-05, "epoch": 0.6506666666666666, "percentage": 36.6, "cur_time": "2024-08-21 06:34:28", "elapsed_time": "0:08:09", "remaining_time": "0:14:07", "throughput": "1968.18", "total_tokens": 963056} +{"current_steps": 369, "total_steps": 1000, "loss": 1.2025, "learning_rate": 7.953028338099627e-05, "epoch": 0.656, "percentage": 36.9, "cur_time": "2024-08-21 06:34:31", "elapsed_time": "0:08:13", "remaining_time": "0:14:03", "throughput": "1967.33", "total_tokens": 970048} +{"current_steps": 372, "total_steps": 1000, "loss": 1.313, "learning_rate": 7.910614850786448e-05, "epoch": 0.6613333333333333, "percentage": 37.2, "cur_time": "2024-08-21 06:34:35", "elapsed_time": "0:08:16", "remaining_time": "0:13:58", "throughput": "1966.65", "total_tokens": 977312} +{"current_steps": 375, "total_steps": 1000, "loss": 1.479, "learning_rate": 7.86788218175523e-05, "epoch": 0.6666666666666666, "percentage": 37.5, "cur_time": "2024-08-21 06:34:39", "elapsed_time": "0:08:20", "remaining_time": "0:13:54", "throughput": "1965.59", "total_tokens": 984576} +{"current_steps": 378, "total_steps": 1000, "loss": 1.435, "learning_rate": 7.82483501712469e-05, "epoch": 0.672, "percentage": 37.8, "cur_time": "2024-08-21 06:34:43", "elapsed_time": "0:08:24", "remaining_time": "0:13:50", "throughput": "1965.97", "total_tokens": 992464} +{"current_steps": 381, "total_steps": 1000, "loss": 1.1344, "learning_rate": 7.781478077501525e-05, "epoch": 0.6773333333333333, "percentage": 38.1, "cur_time": "2024-08-21 06:34:47", "elapsed_time": "0:08:28", "remaining_time": "0:13:46", "throughput": "1966.42", "total_tokens": 1000400} +{"current_steps": 384, "total_steps": 1000, "loss": 1.2989, "learning_rate": 7.737816117462752e-05, "epoch": 0.6826666666666666, "percentage": 38.4, "cur_time": "2024-08-21 06:34:51", "elapsed_time": "0:08:32", "remaining_time": "0:13:42", "throughput": "1966.45", "total_tokens": 1007952} +{"current_steps": 387, "total_steps": 1000, "loss": 1.0671, "learning_rate": 7.693853925034315e-05, "epoch": 0.688, "percentage": 38.7, "cur_time": "2024-08-21 06:34:55", "elapsed_time": "0:08:36", "remaining_time": "0:13:37", "throughput": "1963.85", "total_tokens": 1014064} +{"current_steps": 390, "total_steps": 1000, "loss": 1.2786, "learning_rate": 7.649596321166024e-05, "epoch": 0.6933333333333334, "percentage": 39.0, "cur_time": "2024-08-21 06:34:59", "elapsed_time": "0:08:40", "remaining_time": "0:13:33", "throughput": "1962.97", "total_tokens": 1021472} +{"current_steps": 393, "total_steps": 1000, "loss": 1.4096, "learning_rate": 7.605048159202883e-05, "epoch": 0.6986666666666667, "percentage": 39.3, "cur_time": "2024-08-21 06:35:02", "elapsed_time": "0:08:44", "remaining_time": "0:13:29", "throughput": "1962.67", "total_tokens": 1028848} +{"current_steps": 396, "total_steps": 1000, "loss": 1.1785, "learning_rate": 7.560214324352858e-05, "epoch": 0.704, "percentage": 39.6, "cur_time": "2024-08-21 06:35:07", "elapsed_time": "0:08:48", "remaining_time": "0:13:25", "throughput": "1963.90", "total_tokens": 1037760} +{"current_steps": 399, "total_steps": 1000, "loss": 1.2757, "learning_rate": 7.515099733151177e-05, "epoch": 0.7093333333333334, "percentage": 39.9, "cur_time": "2024-08-21 06:35:11", "elapsed_time": "0:08:52", "remaining_time": "0:13:21", "throughput": "1963.96", "total_tokens": 1045472} +{"current_steps": 402, "total_steps": 1000, "loss": 1.2499, "learning_rate": 7.469709332921155e-05, "epoch": 0.7146666666666667, "percentage": 40.2, "cur_time": "2024-08-21 06:35:15", "elapsed_time": "0:08:56", "remaining_time": "0:13:18", "throughput": "1962.39", "total_tokens": 1053024} +{"current_steps": 405, "total_steps": 1000, "loss": 1.2605, "learning_rate": 7.424048101231686e-05, "epoch": 0.72, "percentage": 40.5, "cur_time": "2024-08-21 06:35:19", "elapsed_time": "0:09:00", "remaining_time": "0:13:14", "throughput": "1961.13", "total_tokens": 1060336} +{"current_steps": 408, "total_steps": 1000, "loss": 1.2975, "learning_rate": 7.378121045351378e-05, "epoch": 0.7253333333333334, "percentage": 40.8, "cur_time": "2024-08-21 06:35:23", "elapsed_time": "0:09:04", "remaining_time": "0:13:10", "throughput": "1959.94", "total_tokens": 1067184} +{"current_steps": 411, "total_steps": 1000, "loss": 1.1854, "learning_rate": 7.331933201699457e-05, "epoch": 0.7306666666666667, "percentage": 41.1, "cur_time": "2024-08-21 06:35:27", "elapsed_time": "0:09:08", "remaining_time": "0:13:05", "throughput": "1959.63", "total_tokens": 1074768} +{"current_steps": 414, "total_steps": 1000, "loss": 1.1631, "learning_rate": 7.285489635293472e-05, "epoch": 0.736, "percentage": 41.4, "cur_time": "2024-08-21 06:35:31", "elapsed_time": "0:09:12", "remaining_time": "0:13:02", "throughput": "1960.66", "total_tokens": 1083360} +{"current_steps": 417, "total_steps": 1000, "loss": 1.2159, "learning_rate": 7.238795439193848e-05, "epoch": 0.7413333333333333, "percentage": 41.7, "cur_time": "2024-08-21 06:35:35", "elapsed_time": "0:09:16", "remaining_time": "0:12:57", "throughput": "1961.16", "total_tokens": 1091200} +{"current_steps": 420, "total_steps": 1000, "loss": 1.201, "learning_rate": 7.191855733945387e-05, "epoch": 0.7466666666666667, "percentage": 42.0, "cur_time": "2024-08-21 06:35:39", "elapsed_time": "0:09:20", "remaining_time": "0:12:54", "throughput": "1963.54", "total_tokens": 1100976} +{"current_steps": 423, "total_steps": 1000, "loss": 1.3129, "learning_rate": 7.14467566701573e-05, "epoch": 0.752, "percentage": 42.3, "cur_time": "2024-08-21 06:35:43", "elapsed_time": "0:09:24", "remaining_time": "0:12:50", "throughput": "1964.23", "total_tokens": 1109264} +{"current_steps": 426, "total_steps": 1000, "loss": 1.5195, "learning_rate": 7.097260412230886e-05, "epoch": 0.7573333333333333, "percentage": 42.6, "cur_time": "2024-08-21 06:35:47", "elapsed_time": "0:09:28", "remaining_time": "0:12:46", "throughput": "1965.11", "total_tokens": 1117568} +{"current_steps": 429, "total_steps": 1000, "loss": 1.3843, "learning_rate": 7.049615169207864e-05, "epoch": 0.7626666666666667, "percentage": 42.9, "cur_time": "2024-08-21 06:35:51", "elapsed_time": "0:09:32", "remaining_time": "0:12:42", "throughput": "1966.48", "total_tokens": 1126416} +{"current_steps": 432, "total_steps": 1000, "loss": 1.1864, "learning_rate": 7.001745162784477e-05, "epoch": 0.768, "percentage": 43.2, "cur_time": "2024-08-21 06:35:55", "elapsed_time": "0:09:36", "remaining_time": "0:12:38", "throughput": "1966.63", "total_tokens": 1133984} +{"current_steps": 435, "total_steps": 1000, "loss": 1.4373, "learning_rate": 6.953655642446368e-05, "epoch": 0.7733333333333333, "percentage": 43.5, "cur_time": "2024-08-21 06:35:59", "elapsed_time": "0:09:40", "remaining_time": "0:12:34", "throughput": "1967.68", "total_tokens": 1142608} +{"current_steps": 438, "total_steps": 1000, "loss": 1.2651, "learning_rate": 6.905351881751372e-05, "epoch": 0.7786666666666666, "percentage": 43.8, "cur_time": "2024-08-21 06:36:03", "elapsed_time": "0:09:45", "remaining_time": "0:12:30", "throughput": "1969.64", "total_tokens": 1152304} +{"current_steps": 441, "total_steps": 1000, "loss": 1.2864, "learning_rate": 6.856839177751176e-05, "epoch": 0.784, "percentage": 44.1, "cur_time": "2024-08-21 06:36:08", "elapsed_time": "0:09:49", "remaining_time": "0:12:27", "throughput": "1970.11", "total_tokens": 1161072} +{"current_steps": 444, "total_steps": 1000, "loss": 1.1525, "learning_rate": 6.808122850410461e-05, "epoch": 0.7893333333333333, "percentage": 44.4, "cur_time": "2024-08-21 06:36:12", "elapsed_time": "0:09:53", "remaining_time": "0:12:22", "throughput": "1969.54", "total_tokens": 1168480} +{"current_steps": 447, "total_steps": 1000, "loss": 1.3601, "learning_rate": 6.759208242023509e-05, "epoch": 0.7946666666666666, "percentage": 44.7, "cur_time": "2024-08-21 06:36:15", "elapsed_time": "0:09:57", "remaining_time": "0:12:18", "throughput": "1968.55", "total_tokens": 1175392} +{"current_steps": 450, "total_steps": 1000, "loss": 1.4018, "learning_rate": 6.710100716628344e-05, "epoch": 0.8, "percentage": 45.0, "cur_time": "2024-08-21 06:36:20", "elapsed_time": "0:10:01", "remaining_time": "0:12:14", "throughput": "1970.02", "total_tokens": 1184544} +{"current_steps": 453, "total_steps": 1000, "loss": 1.2425, "learning_rate": 6.660805659418516e-05, "epoch": 0.8053333333333333, "percentage": 45.3, "cur_time": "2024-08-21 06:36:24", "elapsed_time": "0:10:05", "remaining_time": "0:12:10", "throughput": "1970.55", "total_tokens": 1192880} +{"current_steps": 456, "total_steps": 1000, "loss": 1.2589, "learning_rate": 6.611328476152557e-05, "epoch": 0.8106666666666666, "percentage": 45.6, "cur_time": "2024-08-21 06:36:28", "elapsed_time": "0:10:09", "remaining_time": "0:12:06", "throughput": "1971.08", "total_tokens": 1200928} +{"current_steps": 459, "total_steps": 1000, "loss": 1.3289, "learning_rate": 6.561674592561163e-05, "epoch": 0.816, "percentage": 45.9, "cur_time": "2024-08-21 06:36:32", "elapsed_time": "0:10:13", "remaining_time": "0:12:02", "throughput": "1971.41", "total_tokens": 1209056} +{"current_steps": 462, "total_steps": 1000, "loss": 1.2851, "learning_rate": 6.511849453752223e-05, "epoch": 0.8213333333333334, "percentage": 46.2, "cur_time": "2024-08-21 06:36:35", "elapsed_time": "0:10:17", "remaining_time": "0:11:58", "throughput": "1971.88", "total_tokens": 1217040} +{"current_steps": 465, "total_steps": 1000, "loss": 1.2584, "learning_rate": 6.461858523613684e-05, "epoch": 0.8266666666666667, "percentage": 46.5, "cur_time": "2024-08-21 06:36:39", "elapsed_time": "0:10:21", "remaining_time": "0:11:54", "throughput": "1970.38", "total_tokens": 1223712} +{"current_steps": 468, "total_steps": 1000, "loss": 1.2729, "learning_rate": 6.411707284214384e-05, "epoch": 0.832, "percentage": 46.8, "cur_time": "2024-08-21 06:36:43", "elapsed_time": "0:10:24", "remaining_time": "0:11:50", "throughput": "1969.84", "total_tokens": 1230736} +{"current_steps": 471, "total_steps": 1000, "loss": 1.3603, "learning_rate": 6.361401235202872e-05, "epoch": 0.8373333333333334, "percentage": 47.1, "cur_time": "2024-08-21 06:36:47", "elapsed_time": "0:10:28", "remaining_time": "0:11:45", "throughput": "1969.18", "total_tokens": 1237728} +{"current_steps": 474, "total_steps": 1000, "loss": 1.1509, "learning_rate": 6.310945893204324e-05, "epoch": 0.8426666666666667, "percentage": 47.4, "cur_time": "2024-08-21 06:36:51", "elapsed_time": "0:10:33", "remaining_time": "0:11:42", "throughput": "1970.80", "total_tokens": 1247568} +{"current_steps": 477, "total_steps": 1000, "loss": 1.3986, "learning_rate": 6.26034679121557e-05, "epoch": 0.848, "percentage": 47.7, "cur_time": "2024-08-21 06:36:55", "elapsed_time": "0:10:37", "remaining_time": "0:11:38", "throughput": "1971.33", "total_tokens": 1255968} +{"current_steps": 480, "total_steps": 1000, "loss": 1.3266, "learning_rate": 6.209609477998338e-05, "epoch": 0.8533333333333334, "percentage": 48.0, "cur_time": "2024-08-21 06:36:59", "elapsed_time": "0:10:40", "remaining_time": "0:11:34", "throughput": "1969.90", "total_tokens": 1262640} +{"current_steps": 483, "total_steps": 1000, "loss": 1.5078, "learning_rate": 6.158739517470786e-05, "epoch": 0.8586666666666667, "percentage": 48.3, "cur_time": "2024-08-21 06:37:03", "elapsed_time": "0:10:44", "remaining_time": "0:11:30", "throughput": "1969.83", "total_tokens": 1270464} +{"current_steps": 486, "total_steps": 1000, "loss": 1.0109, "learning_rate": 6.107742488097338e-05, "epoch": 0.864, "percentage": 48.6, "cur_time": "2024-08-21 06:37:07", "elapsed_time": "0:10:49", "remaining_time": "0:11:26", "throughput": "1969.68", "total_tokens": 1278400} +{"current_steps": 489, "total_steps": 1000, "loss": 1.3723, "learning_rate": 6.056623982276944e-05, "epoch": 0.8693333333333333, "percentage": 48.9, "cur_time": "2024-08-21 06:37:12", "elapsed_time": "0:10:53", "remaining_time": "0:11:22", "throughput": "1970.34", "total_tokens": 1287072} +{"current_steps": 492, "total_steps": 1000, "loss": 1.3698, "learning_rate": 6.005389605729824e-05, "epoch": 0.8746666666666667, "percentage": 49.2, "cur_time": "2024-08-21 06:37:16", "elapsed_time": "0:10:57", "remaining_time": "0:11:18", "throughput": "1970.13", "total_tokens": 1294784} +{"current_steps": 495, "total_steps": 1000, "loss": 1.2799, "learning_rate": 5.9540449768827246e-05, "epoch": 0.88, "percentage": 49.5, "cur_time": "2024-08-21 06:37:20", "elapsed_time": "0:11:01", "remaining_time": "0:11:14", "throughput": "1970.98", "total_tokens": 1303872} +{"current_steps": 498, "total_steps": 1000, "loss": 1.1711, "learning_rate": 5.902595726252801e-05, "epoch": 0.8853333333333333, "percentage": 49.8, "cur_time": "2024-08-21 06:37:24", "elapsed_time": "0:11:05", "remaining_time": "0:11:10", "throughput": "1968.91", "total_tokens": 1309888} +{"current_steps": 500, "total_steps": 1000, "eval_loss": 1.2698992490768433, "epoch": 0.8888888888888888, "percentage": 50.0, "cur_time": "2024-08-21 06:37:49", "elapsed_time": "0:11:31", "remaining_time": "0:11:31", "throughput": "1902.51", "total_tokens": 1314896} +{"current_steps": 501, "total_steps": 1000, "loss": 1.3553, "learning_rate": 5.851047495830163e-05, "epoch": 0.8906666666666667, "percentage": 50.1, "cur_time": "2024-08-21 06:37:51", "elapsed_time": "0:11:32", "remaining_time": "0:11:29", "throughput": "1902.04", "total_tokens": 1317600} +{"current_steps": 504, "total_steps": 1000, "loss": 1.4642, "learning_rate": 5.799405938459175e-05, "epoch": 0.896, "percentage": 50.4, "cur_time": "2024-08-21 06:37:55", "elapsed_time": "0:11:36", "remaining_time": "0:11:25", "throughput": "1903.64", "total_tokens": 1326576} +{"current_steps": 507, "total_steps": 1000, "loss": 1.1863, "learning_rate": 5.747676717218549e-05, "epoch": 0.9013333333333333, "percentage": 50.7, "cur_time": "2024-08-21 06:37:59", "elapsed_time": "0:11:40", "remaining_time": "0:11:21", "throughput": "1902.59", "total_tokens": 1332944} +{"current_steps": 510, "total_steps": 1000, "loss": 1.0854, "learning_rate": 5.695865504800327e-05, "epoch": 0.9066666666666666, "percentage": 51.0, "cur_time": "2024-08-21 06:38:03", "elapsed_time": "0:11:44", "remaining_time": "0:11:16", "throughput": "1902.95", "total_tokens": 1340672} +{"current_steps": 513, "total_steps": 1000, "loss": 1.1361, "learning_rate": 5.643977982887815e-05, "epoch": 0.912, "percentage": 51.3, "cur_time": "2024-08-21 06:38:07", "elapsed_time": "0:11:48", "remaining_time": "0:11:12", "throughput": "1902.29", "total_tokens": 1347408} +{"current_steps": 516, "total_steps": 1000, "loss": 1.5376, "learning_rate": 5.5920198415325064e-05, "epoch": 0.9173333333333333, "percentage": 51.6, "cur_time": "2024-08-21 06:38:11", "elapsed_time": "0:11:52", "remaining_time": "0:11:08", "throughput": "1903.52", "total_tokens": 1356096} +{"current_steps": 519, "total_steps": 1000, "loss": 1.3248, "learning_rate": 5.539996778530115e-05, "epoch": 0.9226666666666666, "percentage": 51.9, "cur_time": "2024-08-21 06:38:15", "elapsed_time": "0:11:56", "remaining_time": "0:11:04", "throughput": "1904.70", "total_tokens": 1364832} +{"current_steps": 522, "total_steps": 1000, "loss": 1.2347, "learning_rate": 5.487914498795747e-05, "epoch": 0.928, "percentage": 52.2, "cur_time": "2024-08-21 06:38:19", "elapsed_time": "0:12:00", "remaining_time": "0:10:59", "throughput": "1903.93", "total_tokens": 1371520} +{"current_steps": 525, "total_steps": 1000, "loss": 1.2789, "learning_rate": 5.435778713738292e-05, "epoch": 0.9333333333333333, "percentage": 52.5, "cur_time": "2024-08-21 06:38:23", "elapsed_time": "0:12:04", "remaining_time": "0:10:55", "throughput": "1906.39", "total_tokens": 1382112} +{"current_steps": 528, "total_steps": 1000, "loss": 1.2137, "learning_rate": 5.383595140634093e-05, "epoch": 0.9386666666666666, "percentage": 52.8, "cur_time": "2024-08-21 06:38:28", "elapsed_time": "0:12:09", "remaining_time": "0:10:51", "throughput": "1907.29", "total_tokens": 1390880} +{"current_steps": 531, "total_steps": 1000, "loss": 1.1884, "learning_rate": 5.3313695020000024e-05, "epoch": 0.944, "percentage": 53.1, "cur_time": "2024-08-21 06:38:32", "elapsed_time": "0:12:13", "remaining_time": "0:10:47", "throughput": "1908.30", "total_tokens": 1399248} +{"current_steps": 534, "total_steps": 1000, "loss": 1.0717, "learning_rate": 5.279107524965819e-05, "epoch": 0.9493333333333334, "percentage": 53.4, "cur_time": "2024-08-21 06:38:36", "elapsed_time": "0:12:17", "remaining_time": "0:10:43", "throughput": "1908.90", "total_tokens": 1407344} +{"current_steps": 537, "total_steps": 1000, "loss": 1.341, "learning_rate": 5.226814940646269e-05, "epoch": 0.9546666666666667, "percentage": 53.7, "cur_time": "2024-08-21 06:38:39", "elapsed_time": "0:12:21", "remaining_time": "0:10:39", "throughput": "1909.00", "total_tokens": 1414864} +{"current_steps": 540, "total_steps": 1000, "loss": 1.2898, "learning_rate": 5.174497483512506e-05, "epoch": 0.96, "percentage": 54.0, "cur_time": "2024-08-21 06:38:44", "elapsed_time": "0:12:25", "remaining_time": "0:10:34", "throughput": "1910.54", "total_tokens": 1424144} +{"current_steps": 543, "total_steps": 1000, "loss": 1.226, "learning_rate": 5.1221608907632665e-05, "epoch": 0.9653333333333334, "percentage": 54.3, "cur_time": "2024-08-21 06:38:48", "elapsed_time": "0:12:29", "remaining_time": "0:10:30", "throughput": "1909.88", "total_tokens": 1431120} +{"current_steps": 546, "total_steps": 1000, "loss": 1.3647, "learning_rate": 5.0698109016957274e-05, "epoch": 0.9706666666666667, "percentage": 54.6, "cur_time": "2024-08-21 06:38:52", "elapsed_time": "0:12:33", "remaining_time": "0:10:26", "throughput": "1910.40", "total_tokens": 1439056} +{"current_steps": 549, "total_steps": 1000, "loss": 1.168, "learning_rate": 5.017453257076119e-05, "epoch": 0.976, "percentage": 54.9, "cur_time": "2024-08-21 06:38:55", "elapsed_time": "0:12:37", "remaining_time": "0:10:22", "throughput": "1911.31", "total_tokens": 1447184} +{"current_steps": 552, "total_steps": 1000, "loss": 1.3466, "learning_rate": 4.965093698510193e-05, "epoch": 0.9813333333333333, "percentage": 55.2, "cur_time": "2024-08-21 06:38:59", "elapsed_time": "0:12:41", "remaining_time": "0:10:17", "throughput": "1911.67", "total_tokens": 1454992} +{"current_steps": 555, "total_steps": 1000, "loss": 1.1918, "learning_rate": 4.912737967813583e-05, "epoch": 0.9866666666666667, "percentage": 55.5, "cur_time": "2024-08-21 06:39:03", "elapsed_time": "0:12:44", "remaining_time": "0:10:13", "throughput": "1911.47", "total_tokens": 1462048} +{"current_steps": 558, "total_steps": 1000, "loss": 1.2577, "learning_rate": 4.860391806382157e-05, "epoch": 0.992, "percentage": 55.8, "cur_time": "2024-08-21 06:39:07", "elapsed_time": "0:12:48", "remaining_time": "0:10:08", "throughput": "1910.94", "total_tokens": 1469024} +{"current_steps": 561, "total_steps": 1000, "loss": 1.2251, "learning_rate": 4.8080609545624004e-05, "epoch": 0.9973333333333333, "percentage": 56.1, "cur_time": "2024-08-21 06:39:11", "elapsed_time": "0:12:52", "remaining_time": "0:10:04", "throughput": "1910.81", "total_tokens": 1476496} +{"current_steps": 564, "total_steps": 1000, "loss": 1.3152, "learning_rate": 4.755751151021934e-05, "epoch": 1.0026666666666666, "percentage": 56.4, "cur_time": "2024-08-21 06:39:15", "elapsed_time": "0:12:56", "remaining_time": "0:10:00", "throughput": "1912.17", "total_tokens": 1485568} +{"current_steps": 567, "total_steps": 1000, "loss": 1.1243, "learning_rate": 4.703468132120193e-05, "epoch": 1.008, "percentage": 56.7, "cur_time": "2024-08-21 06:39:19", "elapsed_time": "0:13:00", "remaining_time": "0:09:56", "throughput": "1911.63", "total_tokens": 1492544} +{"current_steps": 570, "total_steps": 1000, "loss": 1.3484, "learning_rate": 4.6512176312793736e-05, "epoch": 1.0133333333333334, "percentage": 57.0, "cur_time": "2024-08-21 06:39:23", "elapsed_time": "0:13:05", "remaining_time": "0:09:52", "throughput": "1912.29", "total_tokens": 1501216} +{"current_steps": 573, "total_steps": 1000, "loss": 1.1443, "learning_rate": 4.599005378355706e-05, "epoch": 1.0186666666666666, "percentage": 57.3, "cur_time": "2024-08-21 06:39:27", "elapsed_time": "0:13:09", "remaining_time": "0:09:48", "throughput": "1913.24", "total_tokens": 1509824} +{"current_steps": 576, "total_steps": 1000, "loss": 1.1144, "learning_rate": 4.5468370990111006e-05, "epoch": 1.024, "percentage": 57.6, "cur_time": "2024-08-21 06:39:31", "elapsed_time": "0:13:12", "remaining_time": "0:09:43", "throughput": "1912.93", "total_tokens": 1516736} +{"current_steps": 579, "total_steps": 1000, "loss": 1.4006, "learning_rate": 4.494718514085268e-05, "epoch": 1.0293333333333334, "percentage": 57.9, "cur_time": "2024-08-21 06:39:35", "elapsed_time": "0:13:16", "remaining_time": "0:09:39", "throughput": "1913.06", "total_tokens": 1524208} +{"current_steps": 582, "total_steps": 1000, "loss": 1.2874, "learning_rate": 4.442655338968373e-05, "epoch": 1.0346666666666666, "percentage": 58.2, "cur_time": "2024-08-21 06:39:39", "elapsed_time": "0:13:21", "remaining_time": "0:09:35", "throughput": "1915.80", "total_tokens": 1534656} +{"current_steps": 585, "total_steps": 1000, "loss": 1.0906, "learning_rate": 4.390653282974264e-05, "epoch": 1.04, "percentage": 58.5, "cur_time": "2024-08-21 06:39:43", "elapsed_time": "0:13:24", "remaining_time": "0:09:30", "throughput": "1914.90", "total_tokens": 1541136} +{"current_steps": 588, "total_steps": 1000, "loss": 1.1907, "learning_rate": 4.3387180487143876e-05, "epoch": 1.0453333333333332, "percentage": 58.8, "cur_time": "2024-08-21 06:39:47", "elapsed_time": "0:13:29", "remaining_time": "0:09:26", "throughput": "1916.58", "total_tokens": 1550640} +{"current_steps": 591, "total_steps": 1000, "loss": 1.2323, "learning_rate": 4.2868553314724425e-05, "epoch": 1.0506666666666666, "percentage": 59.1, "cur_time": "2024-08-21 06:39:51", "elapsed_time": "0:13:32", "remaining_time": "0:09:22", "throughput": "1915.74", "total_tokens": 1557488} +{"current_steps": 594, "total_steps": 1000, "loss": 1.3508, "learning_rate": 4.23507081857981e-05, "epoch": 1.056, "percentage": 59.4, "cur_time": "2024-08-21 06:39:55", "elapsed_time": "0:13:36", "remaining_time": "0:09:18", "throughput": "1916.32", "total_tokens": 1565392} +{"current_steps": 597, "total_steps": 1000, "loss": 1.3322, "learning_rate": 4.1833701887918904e-05, "epoch": 1.0613333333333332, "percentage": 59.7, "cur_time": "2024-08-21 06:39:59", "elapsed_time": "0:13:40", "remaining_time": "0:09:14", "throughput": "1917.59", "total_tokens": 1574080} +{"current_steps": 600, "total_steps": 1000, "loss": 1.0924, "learning_rate": 4.131759111665349e-05, "epoch": 1.0666666666666667, "percentage": 60.0, "cur_time": "2024-08-21 06:40:03", "elapsed_time": "0:13:44", "remaining_time": "0:09:09", "throughput": "1917.37", "total_tokens": 1581312} +{"current_steps": 603, "total_steps": 1000, "loss": 1.2264, "learning_rate": 4.080243246936399e-05, "epoch": 1.072, "percentage": 60.3, "cur_time": "2024-08-21 06:40:07", "elapsed_time": "0:13:48", "remaining_time": "0:09:05", "throughput": "1916.28", "total_tokens": 1588096} +{"current_steps": 606, "total_steps": 1000, "loss": 1.05, "learning_rate": 4.028828243900141e-05, "epoch": 1.0773333333333333, "percentage": 60.6, "cur_time": "2024-08-21 06:40:11", "elapsed_time": "0:13:52", "remaining_time": "0:09:01", "throughput": "1915.46", "total_tokens": 1594736} +{"current_steps": 609, "total_steps": 1000, "loss": 1.3512, "learning_rate": 3.9775197407910485e-05, "epoch": 1.0826666666666667, "percentage": 60.9, "cur_time": "2024-08-21 06:40:15", "elapsed_time": "0:13:56", "remaining_time": "0:08:57", "throughput": "1916.34", "total_tokens": 1603312} +{"current_steps": 612, "total_steps": 1000, "loss": 1.281, "learning_rate": 3.926323364164684e-05, "epoch": 1.088, "percentage": 61.2, "cur_time": "2024-08-21 06:40:19", "elapsed_time": "0:14:01", "remaining_time": "0:08:53", "throughput": "1918.14", "total_tokens": 1613232} +{"current_steps": 615, "total_steps": 1000, "loss": 1.3661, "learning_rate": 3.875244728280676e-05, "epoch": 1.0933333333333333, "percentage": 61.5, "cur_time": "2024-08-21 06:40:23", "elapsed_time": "0:14:05", "remaining_time": "0:08:49", "throughput": "1919.13", "total_tokens": 1621936} +{"current_steps": 618, "total_steps": 1000, "loss": 1.1989, "learning_rate": 3.82428943448705e-05, "epoch": 1.0986666666666667, "percentage": 61.8, "cur_time": "2024-08-21 06:40:27", "elapsed_time": "0:14:09", "remaining_time": "0:08:44", "throughput": "1919.68", "total_tokens": 1629984} +{"current_steps": 621, "total_steps": 1000, "loss": 1.0763, "learning_rate": 3.773463070605987e-05, "epoch": 1.104, "percentage": 62.1, "cur_time": "2024-08-21 06:40:31", "elapsed_time": "0:14:12", "remaining_time": "0:08:40", "throughput": "1918.61", "total_tokens": 1636016} +{"current_steps": 624, "total_steps": 1000, "loss": 1.1786, "learning_rate": 3.7227712103210486e-05, "epoch": 1.1093333333333333, "percentage": 62.4, "cur_time": "2024-08-21 06:40:35", "elapsed_time": "0:14:16", "remaining_time": "0:08:36", "throughput": "1918.02", "total_tokens": 1642832} +{"current_steps": 627, "total_steps": 1000, "loss": 1.0718, "learning_rate": 3.6722194125659556e-05, "epoch": 1.1146666666666667, "percentage": 62.7, "cur_time": "2024-08-21 06:40:39", "elapsed_time": "0:14:20", "remaining_time": "0:08:31", "throughput": "1916.58", "total_tokens": 1648752} +{"current_steps": 630, "total_steps": 1000, "loss": 1.3182, "learning_rate": 3.6218132209150045e-05, "epoch": 1.12, "percentage": 63.0, "cur_time": "2024-08-21 06:40:43", "elapsed_time": "0:14:24", "remaining_time": "0:08:27", "throughput": "1917.62", "total_tokens": 1657488} +{"current_steps": 633, "total_steps": 1000, "loss": 1.3274, "learning_rate": 3.5715581629751326e-05, "epoch": 1.1253333333333333, "percentage": 63.3, "cur_time": "2024-08-21 06:40:47", "elapsed_time": "0:14:28", "remaining_time": "0:08:23", "throughput": "1918.90", "total_tokens": 1666720} +{"current_steps": 636, "total_steps": 1000, "loss": 1.3109, "learning_rate": 3.5214597497797684e-05, "epoch": 1.1306666666666667, "percentage": 63.6, "cur_time": "2024-08-21 06:40:51", "elapsed_time": "0:14:33", "remaining_time": "0:08:19", "throughput": "1921.85", "total_tokens": 1678048} +{"current_steps": 639, "total_steps": 1000, "loss": 1.287, "learning_rate": 3.471523475184472e-05, "epoch": 1.1360000000000001, "percentage": 63.9, "cur_time": "2024-08-21 06:40:56", "elapsed_time": "0:14:37", "remaining_time": "0:08:15", "throughput": "1923.18", "total_tokens": 1687120} +{"current_steps": 642, "total_steps": 1000, "loss": 1.2379, "learning_rate": 3.4217548152644885e-05, "epoch": 1.1413333333333333, "percentage": 64.2, "cur_time": "2024-08-21 06:41:00", "elapsed_time": "0:14:41", "remaining_time": "0:08:11", "throughput": "1923.58", "total_tokens": 1695232} +{"current_steps": 645, "total_steps": 1000, "loss": 1.2971, "learning_rate": 3.372159227714218e-05, "epoch": 1.1466666666666667, "percentage": 64.5, "cur_time": "2024-08-21 06:41:04", "elapsed_time": "0:14:45", "remaining_time": "0:08:07", "throughput": "1924.12", "total_tokens": 1703424} +{"current_steps": 648, "total_steps": 1000, "loss": 1.1837, "learning_rate": 3.322742151248725e-05, "epoch": 1.152, "percentage": 64.8, "cur_time": "2024-08-21 06:41:08", "elapsed_time": "0:14:49", "remaining_time": "0:08:03", "throughput": "1924.32", "total_tokens": 1711504} +{"current_steps": 651, "total_steps": 1000, "loss": 1.1607, "learning_rate": 3.273509005007327e-05, "epoch": 1.1573333333333333, "percentage": 65.1, "cur_time": "2024-08-21 06:41:12", "elapsed_time": "0:14:53", "remaining_time": "0:07:59", "throughput": "1924.58", "total_tokens": 1720016} +{"current_steps": 654, "total_steps": 1000, "loss": 1.1984, "learning_rate": 3.224465187959316e-05, "epoch": 1.1626666666666667, "percentage": 65.4, "cur_time": "2024-08-21 06:41:16", "elapsed_time": "0:14:57", "remaining_time": "0:07:54", "throughput": "1923.02", "total_tokens": 1725952} +{"current_steps": 657, "total_steps": 1000, "loss": 1.2528, "learning_rate": 3.1756160783119016e-05, "epoch": 1.168, "percentage": 65.7, "cur_time": "2024-08-21 06:41:20", "elapsed_time": "0:15:02", "remaining_time": "0:07:50", "throughput": "1924.28", "total_tokens": 1735728} +{"current_steps": 660, "total_steps": 1000, "loss": 1.2614, "learning_rate": 3.12696703292044e-05, "epoch": 1.1733333333333333, "percentage": 66.0, "cur_time": "2024-08-21 06:41:24", "elapsed_time": "0:15:05", "remaining_time": "0:07:46", "throughput": "1923.34", "total_tokens": 1742160} +{"current_steps": 663, "total_steps": 1000, "loss": 1.2315, "learning_rate": 3.078523386700982e-05, "epoch": 1.1786666666666668, "percentage": 66.3, "cur_time": "2024-08-21 06:41:28", "elapsed_time": "0:15:09", "remaining_time": "0:07:42", "throughput": "1922.45", "total_tokens": 1748592} +{"current_steps": 666, "total_steps": 1000, "loss": 1.1794, "learning_rate": 3.0302904520452447e-05, "epoch": 1.184, "percentage": 66.6, "cur_time": "2024-08-21 06:41:32", "elapsed_time": "0:15:13", "remaining_time": "0:07:38", "throughput": "1922.70", "total_tokens": 1756800} +{"current_steps": 669, "total_steps": 1000, "loss": 1.205, "learning_rate": 2.9822735182380496e-05, "epoch": 1.1893333333333334, "percentage": 66.9, "cur_time": "2024-08-21 06:41:36", "elapsed_time": "0:15:17", "remaining_time": "0:07:34", "throughput": "1922.93", "total_tokens": 1764624} +{"current_steps": 672, "total_steps": 1000, "loss": 1.2932, "learning_rate": 2.934477850877292e-05, "epoch": 1.1946666666666665, "percentage": 67.2, "cur_time": "2024-08-21 06:41:40", "elapsed_time": "0:15:21", "remaining_time": "0:07:29", "throughput": "1923.25", "total_tokens": 1772688} +{"current_steps": 675, "total_steps": 1000, "loss": 1.2259, "learning_rate": 2.886908691296504e-05, "epoch": 1.2, "percentage": 67.5, "cur_time": "2024-08-21 06:41:44", "elapsed_time": "0:15:25", "remaining_time": "0:07:25", "throughput": "1923.18", "total_tokens": 1780128} +{"current_steps": 678, "total_steps": 1000, "loss": 1.2725, "learning_rate": 2.8395712559900877e-05, "epoch": 1.2053333333333334, "percentage": 67.8, "cur_time": "2024-08-21 06:41:48", "elapsed_time": "0:15:29", "remaining_time": "0:07:21", "throughput": "1923.85", "total_tokens": 1788160} +{"current_steps": 681, "total_steps": 1000, "loss": 1.0984, "learning_rate": 2.7924707360412746e-05, "epoch": 1.2106666666666666, "percentage": 68.1, "cur_time": "2024-08-21 06:41:52", "elapsed_time": "0:15:33", "remaining_time": "0:07:17", "throughput": "1922.62", "total_tokens": 1794336} +{"current_steps": 684, "total_steps": 1000, "loss": 1.3735, "learning_rate": 2.7456122965528475e-05, "epoch": 1.216, "percentage": 68.4, "cur_time": "2024-08-21 06:41:56", "elapsed_time": "0:15:37", "remaining_time": "0:07:13", "throughput": "1921.84", "total_tokens": 1801280} +{"current_steps": 687, "total_steps": 1000, "loss": 1.2211, "learning_rate": 2.699001076080742e-05, "epoch": 1.2213333333333334, "percentage": 68.7, "cur_time": "2024-08-21 06:42:00", "elapsed_time": "0:15:41", "remaining_time": "0:07:08", "throughput": "1923.49", "total_tokens": 1811088} +{"current_steps": 690, "total_steps": 1000, "loss": 1.123, "learning_rate": 2.6526421860705473e-05, "epoch": 1.2266666666666666, "percentage": 69.0, "cur_time": "2024-08-21 06:42:03", "elapsed_time": "0:15:45", "remaining_time": "0:07:04", "throughput": "1922.21", "total_tokens": 1816848} +{"current_steps": 693, "total_steps": 1000, "loss": 1.2838, "learning_rate": 2.6065407102969664e-05, "epoch": 1.232, "percentage": 69.3, "cur_time": "2024-08-21 06:42:08", "elapsed_time": "0:15:49", "remaining_time": "0:07:00", "throughput": "1922.49", "total_tokens": 1824992} +{"current_steps": 696, "total_steps": 1000, "loss": 1.1937, "learning_rate": 2.560701704306336e-05, "epoch": 1.2373333333333334, "percentage": 69.6, "cur_time": "2024-08-21 06:42:11", "elapsed_time": "0:15:53", "remaining_time": "0:06:56", "throughput": "1921.98", "total_tokens": 1831712} +{"current_steps": 699, "total_steps": 1000, "loss": 1.2453, "learning_rate": 2.5151301948622237e-05, "epoch": 1.2426666666666666, "percentage": 69.9, "cur_time": "2024-08-21 06:42:16", "elapsed_time": "0:15:57", "remaining_time": "0:06:52", "throughput": "1924.11", "total_tokens": 1842272} +{"current_steps": 702, "total_steps": 1000, "loss": 1.1175, "learning_rate": 2.469831179394182e-05, "epoch": 1.248, "percentage": 70.2, "cur_time": "2024-08-21 06:42:20", "elapsed_time": "0:16:01", "remaining_time": "0:06:48", "throughput": "1921.85", "total_tokens": 1847776} +{"current_steps": 705, "total_steps": 1000, "loss": 1.2671, "learning_rate": 2.4248096254497288e-05, "epoch": 1.2533333333333334, "percentage": 70.5, "cur_time": "2024-08-21 06:42:24", "elapsed_time": "0:16:05", "remaining_time": "0:06:44", "throughput": "1922.92", "total_tokens": 1856992} +{"current_steps": 708, "total_steps": 1000, "loss": 1.2001, "learning_rate": 2.3800704701496053e-05, "epoch": 1.2586666666666666, "percentage": 70.8, "cur_time": "2024-08-21 06:42:28", "elapsed_time": "0:16:09", "remaining_time": "0:06:39", "throughput": "1922.73", "total_tokens": 1864448} +{"current_steps": 711, "total_steps": 1000, "loss": 1.2348, "learning_rate": 2.33561861964635e-05, "epoch": 1.264, "percentage": 71.1, "cur_time": "2024-08-21 06:42:32", "elapsed_time": "0:16:13", "remaining_time": "0:06:35", "throughput": "1924.17", "total_tokens": 1873680} +{"current_steps": 714, "total_steps": 1000, "loss": 1.1127, "learning_rate": 2.2914589485863014e-05, "epoch": 1.2693333333333334, "percentage": 71.4, "cur_time": "2024-08-21 06:42:36", "elapsed_time": "0:16:17", "remaining_time": "0:06:31", "throughput": "1924.89", "total_tokens": 1881744} +{"current_steps": 717, "total_steps": 1000, "loss": 1.1406, "learning_rate": 2.247596299575022e-05, "epoch": 1.2746666666666666, "percentage": 71.7, "cur_time": "2024-08-21 06:42:40", "elapsed_time": "0:16:21", "remaining_time": "0:06:27", "throughput": "1925.43", "total_tokens": 1889808} +{"current_steps": 720, "total_steps": 1000, "loss": 1.1346, "learning_rate": 2.2040354826462668e-05, "epoch": 1.28, "percentage": 72.0, "cur_time": "2024-08-21 06:42:44", "elapsed_time": "0:16:25", "remaining_time": "0:06:23", "throughput": "1924.76", "total_tokens": 1896304} +{"current_steps": 723, "total_steps": 1000, "loss": 1.2673, "learning_rate": 2.160781274734495e-05, "epoch": 1.2853333333333334, "percentage": 72.3, "cur_time": "2024-08-21 06:42:48", "elapsed_time": "0:16:29", "remaining_time": "0:06:19", "throughput": "1926.34", "total_tokens": 1906672} +{"current_steps": 726, "total_steps": 1000, "loss": 1.1793, "learning_rate": 2.117838419151034e-05, "epoch": 1.2906666666666666, "percentage": 72.6, "cur_time": "2024-08-21 06:42:52", "elapsed_time": "0:16:33", "remaining_time": "0:06:15", "throughput": "1925.74", "total_tokens": 1913616} +{"current_steps": 729, "total_steps": 1000, "loss": 1.1043, "learning_rate": 2.0752116250639225e-05, "epoch": 1.296, "percentage": 72.9, "cur_time": "2024-08-21 06:42:56", "elapsed_time": "0:16:37", "remaining_time": "0:06:10", "throughput": "1925.62", "total_tokens": 1920944} +{"current_steps": 732, "total_steps": 1000, "loss": 1.2762, "learning_rate": 2.0329055669814934e-05, "epoch": 1.3013333333333335, "percentage": 73.2, "cur_time": "2024-08-21 06:43:00", "elapsed_time": "0:16:41", "remaining_time": "0:06:06", "throughput": "1925.99", "total_tokens": 1929216} +{"current_steps": 735, "total_steps": 1000, "loss": 1.02, "learning_rate": 1.9909248842397584e-05, "epoch": 1.3066666666666666, "percentage": 73.5, "cur_time": "2024-08-21 06:43:04", "elapsed_time": "0:16:45", "remaining_time": "0:06:02", "throughput": "1925.37", "total_tokens": 1935792} +{"current_steps": 738, "total_steps": 1000, "loss": 1.0995, "learning_rate": 1.9492741804936622e-05, "epoch": 1.312, "percentage": 73.8, "cur_time": "2024-08-21 06:43:08", "elapsed_time": "0:16:49", "remaining_time": "0:05:58", "throughput": "1924.90", "total_tokens": 1942752} +{"current_steps": 741, "total_steps": 1000, "loss": 1.2871, "learning_rate": 1.9079580232122303e-05, "epoch": 1.3173333333333335, "percentage": 74.1, "cur_time": "2024-08-21 06:43:12", "elapsed_time": "0:16:53", "remaining_time": "0:05:54", "throughput": "1924.83", "total_tokens": 1950528} +{"current_steps": 744, "total_steps": 1000, "loss": 1.171, "learning_rate": 1.866980943177699e-05, "epoch": 1.3226666666666667, "percentage": 74.4, "cur_time": "2024-08-21 06:43:15", "elapsed_time": "0:16:57", "remaining_time": "0:05:49", "throughput": "1924.40", "total_tokens": 1957392} +{"current_steps": 747, "total_steps": 1000, "loss": 1.2077, "learning_rate": 1.8263474339886628e-05, "epoch": 1.328, "percentage": 74.7, "cur_time": "2024-08-21 06:43:20", "elapsed_time": "0:17:01", "remaining_time": "0:05:45", "throughput": "1925.57", "total_tokens": 1966480} +{"current_steps": 750, "total_steps": 1000, "loss": 1.0823, "learning_rate": 1.7860619515673033e-05, "epoch": 1.3333333333333333, "percentage": 75.0, "cur_time": "2024-08-21 06:43:24", "elapsed_time": "0:17:05", "remaining_time": "0:05:41", "throughput": "1925.19", "total_tokens": 1973968} +{"current_steps": 753, "total_steps": 1000, "loss": 1.1719, "learning_rate": 1.746128913670746e-05, "epoch": 1.3386666666666667, "percentage": 75.3, "cur_time": "2024-08-21 06:43:27", "elapsed_time": "0:17:09", "remaining_time": "0:05:37", "throughput": "1924.51", "total_tokens": 1980416} +{"current_steps": 756, "total_steps": 1000, "loss": 1.1616, "learning_rate": 1.7065526994065973e-05, "epoch": 1.3439999999999999, "percentage": 75.6, "cur_time": "2024-08-21 06:43:31", "elapsed_time": "0:17:12", "remaining_time": "0:05:33", "throughput": "1924.33", "total_tokens": 1987808} +{"current_steps": 759, "total_steps": 1000, "loss": 1.2321, "learning_rate": 1.667337648752738e-05, "epoch": 1.3493333333333333, "percentage": 75.9, "cur_time": "2024-08-21 06:43:35", "elapsed_time": "0:17:16", "remaining_time": "0:05:29", "throughput": "1924.57", "total_tokens": 1995728} +{"current_steps": 762, "total_steps": 1000, "loss": 1.4247, "learning_rate": 1.6284880620813848e-05, "epoch": 1.3546666666666667, "percentage": 76.2, "cur_time": "2024-08-21 06:43:39", "elapsed_time": "0:17:20", "remaining_time": "0:05:25", "throughput": "1925.32", "total_tokens": 2004016} +{"current_steps": 765, "total_steps": 1000, "loss": 1.1731, "learning_rate": 1.5900081996875083e-05, "epoch": 1.3599999999999999, "percentage": 76.5, "cur_time": "2024-08-21 06:43:43", "elapsed_time": "0:17:24", "remaining_time": "0:05:20", "throughput": "1925.94", "total_tokens": 2012384} +{"current_steps": 768, "total_steps": 1000, "loss": 1.0719, "learning_rate": 1.551902281321651e-05, "epoch": 1.3653333333333333, "percentage": 76.8, "cur_time": "2024-08-21 06:43:47", "elapsed_time": "0:17:28", "remaining_time": "0:05:16", "throughput": "1926.15", "total_tokens": 2020096} +{"current_steps": 771, "total_steps": 1000, "loss": 1.119, "learning_rate": 1.5141744857271778e-05, "epoch": 1.3706666666666667, "percentage": 77.1, "cur_time": "2024-08-21 06:43:51", "elapsed_time": "0:17:32", "remaining_time": "0:05:12", "throughput": "1925.69", "total_tokens": 2027056} +{"current_steps": 774, "total_steps": 1000, "loss": 1.2802, "learning_rate": 1.4768289501820265e-05, "epoch": 1.376, "percentage": 77.4, "cur_time": "2024-08-21 06:43:55", "elapsed_time": "0:17:36", "remaining_time": "0:05:08", "throughput": "1925.43", "total_tokens": 2034256} +{"current_steps": 777, "total_steps": 1000, "loss": 1.172, "learning_rate": 1.439869770045018e-05, "epoch": 1.3813333333333333, "percentage": 77.7, "cur_time": "2024-08-21 06:43:59", "elapsed_time": "0:17:40", "remaining_time": "0:05:04", "throughput": "1924.79", "total_tokens": 2040768} +{"current_steps": 780, "total_steps": 1000, "loss": 1.2637, "learning_rate": 1.4033009983067452e-05, "epoch": 1.3866666666666667, "percentage": 78.0, "cur_time": "2024-08-21 06:44:02", "elapsed_time": "0:17:44", "remaining_time": "0:05:00", "throughput": "1924.36", "total_tokens": 2047648} +{"current_steps": 783, "total_steps": 1000, "loss": 1.1766, "learning_rate": 1.367126645145121e-05, "epoch": 1.392, "percentage": 78.3, "cur_time": "2024-08-21 06:44:06", "elapsed_time": "0:17:47", "remaining_time": "0:04:55", "throughput": "1924.05", "total_tokens": 2054848} +{"current_steps": 786, "total_steps": 1000, "loss": 1.3347, "learning_rate": 1.3313506774856177e-05, "epoch": 1.3973333333333333, "percentage": 78.6, "cur_time": "2024-08-21 06:44:10", "elapsed_time": "0:17:52", "remaining_time": "0:04:51", "throughput": "1925.05", "total_tokens": 2063744} +{"current_steps": 789, "total_steps": 1000, "loss": 1.2374, "learning_rate": 1.29597701856625e-05, "epoch": 1.4026666666666667, "percentage": 78.9, "cur_time": "2024-08-21 06:44:14", "elapsed_time": "0:17:56", "remaining_time": "0:04:47", "throughput": "1925.95", "total_tokens": 2072336} +{"current_steps": 792, "total_steps": 1000, "loss": 1.303, "learning_rate": 1.2610095475073414e-05, "epoch": 1.408, "percentage": 79.2, "cur_time": "2024-08-21 06:44:18", "elapsed_time": "0:17:59", "remaining_time": "0:04:43", "throughput": "1926.87", "total_tokens": 2080992} +{"current_steps": 795, "total_steps": 1000, "loss": 1.0784, "learning_rate": 1.22645209888614e-05, "epoch": 1.4133333333333333, "percentage": 79.5, "cur_time": "2024-08-21 06:44:22", "elapsed_time": "0:18:03", "remaining_time": "0:04:39", "throughput": "1926.30", "total_tokens": 2087728} +{"current_steps": 798, "total_steps": 1000, "loss": 1.2539, "learning_rate": 1.1923084623163172e-05, "epoch": 1.4186666666666667, "percentage": 79.8, "cur_time": "2024-08-21 06:44:26", "elapsed_time": "0:18:07", "remaining_time": "0:04:35", "throughput": "1926.75", "total_tokens": 2095968} +{"current_steps": 801, "total_steps": 1000, "loss": 1.2818, "learning_rate": 1.1585823820323843e-05, "epoch": 1.424, "percentage": 80.1, "cur_time": "2024-08-21 06:44:30", "elapsed_time": "0:18:11", "remaining_time": "0:04:31", "throughput": "1926.15", "total_tokens": 2103088} +{"current_steps": 804, "total_steps": 1000, "loss": 1.143, "learning_rate": 1.1252775564791024e-05, "epoch": 1.4293333333333333, "percentage": 80.4, "cur_time": "2024-08-21 06:44:34", "elapsed_time": "0:18:15", "remaining_time": "0:04:27", "throughput": "1926.54", "total_tokens": 2111408} +{"current_steps": 807, "total_steps": 1000, "loss": 1.233, "learning_rate": 1.0923976379059058e-05, "epoch": 1.4346666666666668, "percentage": 80.7, "cur_time": "2024-08-21 06:44:38", "elapsed_time": "0:18:19", "remaining_time": "0:04:23", "throughput": "1927.12", "total_tokens": 2119632} +{"current_steps": 810, "total_steps": 1000, "loss": 0.9924, "learning_rate": 1.0599462319663905e-05, "epoch": 1.44, "percentage": 81.0, "cur_time": "2024-08-21 06:44:42", "elapsed_time": "0:18:23", "remaining_time": "0:04:18", "throughput": "1927.10", "total_tokens": 2127440} +{"current_steps": 813, "total_steps": 1000, "loss": 1.2331, "learning_rate": 1.0279268973229089e-05, "epoch": 1.4453333333333334, "percentage": 81.3, "cur_time": "2024-08-21 06:44:47", "elapsed_time": "0:18:28", "remaining_time": "0:04:14", "throughput": "1928.29", "total_tokens": 2137088} +{"current_steps": 816, "total_steps": 1000, "loss": 1.196, "learning_rate": 9.963431452563332e-06, "epoch": 1.4506666666666668, "percentage": 81.6, "cur_time": "2024-08-21 06:44:51", "elapsed_time": "0:18:32", "remaining_time": "0:04:10", "throughput": "1928.49", "total_tokens": 2144992} +{"current_steps": 819, "total_steps": 1000, "loss": 1.0854, "learning_rate": 9.651984392809914e-06, "epoch": 1.456, "percentage": 81.9, "cur_time": "2024-08-21 06:44:55", "elapsed_time": "0:18:36", "remaining_time": "0:04:06", "throughput": "1929.11", "total_tokens": 2153936} +{"current_steps": 822, "total_steps": 1000, "loss": 1.1659, "learning_rate": 9.344961947648623e-06, "epoch": 1.4613333333333334, "percentage": 82.2, "cur_time": "2024-08-21 06:44:59", "elapsed_time": "0:18:40", "remaining_time": "0:04:02", "throughput": "1929.15", "total_tokens": 2161760} +{"current_steps": 825, "total_steps": 1000, "loss": 1.1839, "learning_rate": 9.042397785550405e-06, "epoch": 1.4666666666666668, "percentage": 82.5, "cur_time": "2024-08-21 06:45:03", "elapsed_time": "0:18:44", "remaining_time": "0:03:58", "throughput": "1928.83", "total_tokens": 2168736} +{"current_steps": 828, "total_steps": 1000, "loss": 1.2836, "learning_rate": 8.744325086085248e-06, "epoch": 1.472, "percentage": 82.8, "cur_time": "2024-08-21 06:45:06", "elapsed_time": "0:18:48", "remaining_time": "0:03:54", "throughput": "1928.54", "total_tokens": 2175712} +{"current_steps": 831, "total_steps": 1000, "loss": 1.1372, "learning_rate": 8.450776536283594e-06, "epoch": 1.4773333333333334, "percentage": 83.1, "cur_time": "2024-08-21 06:45:10", "elapsed_time": "0:18:52", "remaining_time": "0:03:50", "throughput": "1928.29", "total_tokens": 2182960} +{"current_steps": 834, "total_steps": 1000, "loss": 1.2549, "learning_rate": 8.16178432705192e-06, "epoch": 1.4826666666666668, "percentage": 83.4, "cur_time": "2024-08-21 06:45:14", "elapsed_time": "0:18:56", "remaining_time": "0:03:46", "throughput": "1928.69", "total_tokens": 2191232} +{"current_steps": 837, "total_steps": 1000, "loss": 1.1984, "learning_rate": 7.877380149642626e-06, "epoch": 1.488, "percentage": 83.7, "cur_time": "2024-08-21 06:45:18", "elapsed_time": "0:19:00", "remaining_time": "0:03:42", "throughput": "1929.56", "total_tokens": 2200064} +{"current_steps": 840, "total_steps": 1000, "loss": 1.4679, "learning_rate": 7.597595192178702e-06, "epoch": 1.4933333333333334, "percentage": 84.0, "cur_time": "2024-08-21 06:45:23", "elapsed_time": "0:19:04", "remaining_time": "0:03:37", "throughput": "1930.42", "total_tokens": 2209072} +{"current_steps": 843, "total_steps": 1000, "loss": 1.3558, "learning_rate": 7.322460136233622e-06, "epoch": 1.4986666666666666, "percentage": 84.3, "cur_time": "2024-08-21 06:45:26", "elapsed_time": "0:19:08", "remaining_time": "0:03:33", "throughput": "1930.72", "total_tokens": 2216608} +{"current_steps": 846, "total_steps": 1000, "loss": 1.1688, "learning_rate": 7.052005153466779e-06, "epoch": 1.504, "percentage": 84.6, "cur_time": "2024-08-21 06:45:30", "elapsed_time": "0:19:12", "remaining_time": "0:03:29", "throughput": "1931.13", "total_tokens": 2224784} +{"current_steps": 849, "total_steps": 1000, "loss": 1.0322, "learning_rate": 6.786259902314768e-06, "epoch": 1.5093333333333332, "percentage": 84.9, "cur_time": "2024-08-21 06:45:34", "elapsed_time": "0:19:16", "remaining_time": "0:03:25", "throughput": "1931.30", "total_tokens": 2232640} +{"current_steps": 852, "total_steps": 1000, "loss": 1.1789, "learning_rate": 6.52525352473905e-06, "epoch": 1.5146666666666668, "percentage": 85.2, "cur_time": "2024-08-21 06:45:38", "elapsed_time": "0:19:20", "remaining_time": "0:03:21", "throughput": "1931.87", "total_tokens": 2241184} +{"current_steps": 855, "total_steps": 1000, "loss": 1.2104, "learning_rate": 6.269014643030213e-06, "epoch": 1.52, "percentage": 85.5, "cur_time": "2024-08-21 06:45:42", "elapsed_time": "0:19:24", "remaining_time": "0:03:17", "throughput": "1931.89", "total_tokens": 2248848} +{"current_steps": 858, "total_steps": 1000, "loss": 1.2657, "learning_rate": 6.017571356669183e-06, "epoch": 1.5253333333333332, "percentage": 85.8, "cur_time": "2024-08-21 06:45:46", "elapsed_time": "0:19:28", "remaining_time": "0:03:13", "throughput": "1932.81", "total_tokens": 2257808} +{"current_steps": 861, "total_steps": 1000, "loss": 1.1423, "learning_rate": 5.770951239245803e-06, "epoch": 1.5306666666666666, "percentage": 86.1, "cur_time": "2024-08-21 06:45:50", "elapsed_time": "0:19:31", "remaining_time": "0:03:09", "throughput": "1932.63", "total_tokens": 2264848} +{"current_steps": 864, "total_steps": 1000, "loss": 1.0841, "learning_rate": 5.529181335435124e-06, "epoch": 1.536, "percentage": 86.4, "cur_time": "2024-08-21 06:45:54", "elapsed_time": "0:19:35", "remaining_time": "0:03:05", "throughput": "1932.04", "total_tokens": 2271568} +{"current_steps": 867, "total_steps": 1000, "loss": 1.2021, "learning_rate": 5.292288158031594e-06, "epoch": 1.5413333333333332, "percentage": 86.7, "cur_time": "2024-08-21 06:45:58", "elapsed_time": "0:19:39", "remaining_time": "0:03:00", "throughput": "1931.64", "total_tokens": 2278592} +{"current_steps": 870, "total_steps": 1000, "loss": 1.0631, "learning_rate": 5.060297685041659e-06, "epoch": 1.5466666666666666, "percentage": 87.0, "cur_time": "2024-08-21 06:46:02", "elapsed_time": "0:19:43", "remaining_time": "0:02:56", "throughput": "1931.66", "total_tokens": 2286304} +{"current_steps": 873, "total_steps": 1000, "loss": 1.1668, "learning_rate": 4.833235356834959e-06, "epoch": 1.552, "percentage": 87.3, "cur_time": "2024-08-21 06:46:06", "elapsed_time": "0:19:47", "remaining_time": "0:02:52", "throughput": "1932.45", "total_tokens": 2295040} +{"current_steps": 876, "total_steps": 1000, "loss": 1.2205, "learning_rate": 4.611126073354571e-06, "epoch": 1.5573333333333332, "percentage": 87.6, "cur_time": "2024-08-21 06:46:10", "elapsed_time": "0:19:51", "remaining_time": "0:02:48", "throughput": "1932.98", "total_tokens": 2304000} +{"current_steps": 879, "total_steps": 1000, "loss": 1.1882, "learning_rate": 4.3939941913863525e-06, "epoch": 1.5626666666666666, "percentage": 87.9, "cur_time": "2024-08-21 06:46:14", "elapsed_time": "0:19:55", "remaining_time": "0:02:44", "throughput": "1932.36", "total_tokens": 2310384} +{"current_steps": 882, "total_steps": 1000, "loss": 1.2363, "learning_rate": 4.181863521888019e-06, "epoch": 1.568, "percentage": 88.2, "cur_time": "2024-08-21 06:46:18", "elapsed_time": "0:19:59", "remaining_time": "0:02:40", "throughput": "1931.96", "total_tokens": 2317008} +{"current_steps": 885, "total_steps": 1000, "loss": 1.1589, "learning_rate": 3.974757327377981e-06, "epoch": 1.5733333333333333, "percentage": 88.5, "cur_time": "2024-08-21 06:46:22", "elapsed_time": "0:20:03", "remaining_time": "0:02:36", "throughput": "1931.96", "total_tokens": 2324752} +{"current_steps": 888, "total_steps": 1000, "loss": 1.0814, "learning_rate": 3.772698319384349e-06, "epoch": 1.5786666666666667, "percentage": 88.8, "cur_time": "2024-08-21 06:46:26", "elapsed_time": "0:20:07", "remaining_time": "0:02:32", "throughput": "1931.95", "total_tokens": 2332416} +{"current_steps": 891, "total_steps": 1000, "loss": 1.2078, "learning_rate": 3.575708655954324e-06, "epoch": 1.584, "percentage": 89.1, "cur_time": "2024-08-21 06:46:30", "elapsed_time": "0:20:11", "remaining_time": "0:02:28", "throughput": "1932.86", "total_tokens": 2341936} +{"current_steps": 894, "total_steps": 1000, "loss": 1.1234, "learning_rate": 3.3838099392243916e-06, "epoch": 1.5893333333333333, "percentage": 89.4, "cur_time": "2024-08-21 06:46:35", "elapsed_time": "0:20:16", "remaining_time": "0:02:24", "throughput": "1933.74", "total_tokens": 2351968} +{"current_steps": 897, "total_steps": 1000, "loss": 1.2396, "learning_rate": 3.197023213051337e-06, "epoch": 1.5946666666666667, "percentage": 89.7, "cur_time": "2024-08-21 06:46:39", "elapsed_time": "0:20:20", "remaining_time": "0:02:20", "throughput": "1934.58", "total_tokens": 2361200} +{"current_steps": 900, "total_steps": 1000, "loss": 1.2116, "learning_rate": 3.0153689607045845e-06, "epoch": 1.6, "percentage": 90.0, "cur_time": "2024-08-21 06:46:43", "elapsed_time": "0:20:24", "remaining_time": "0:02:16", "throughput": "1934.16", "total_tokens": 2367984} +{"current_steps": 903, "total_steps": 1000, "loss": 1.1365, "learning_rate": 2.8388671026199522e-06, "epoch": 1.6053333333333333, "percentage": 90.3, "cur_time": "2024-08-21 06:46:47", "elapsed_time": "0:20:28", "remaining_time": "0:02:11", "throughput": "1933.66", "total_tokens": 2375360} +{"current_steps": 906, "total_steps": 1000, "loss": 1.2196, "learning_rate": 2.667536994215186e-06, "epoch": 1.6106666666666667, "percentage": 90.6, "cur_time": "2024-08-21 06:46:51", "elapsed_time": "0:20:32", "remaining_time": "0:02:07", "throughput": "1933.91", "total_tokens": 2383344} +{"current_steps": 909, "total_steps": 1000, "loss": 1.2588, "learning_rate": 2.501397423767382e-06, "epoch": 1.616, "percentage": 90.9, "cur_time": "2024-08-21 06:46:55", "elapsed_time": "0:20:36", "remaining_time": "0:02:03", "throughput": "1933.63", "total_tokens": 2390464} +{"current_steps": 912, "total_steps": 1000, "loss": 1.195, "learning_rate": 2.340466610352654e-06, "epoch": 1.6213333333333333, "percentage": 91.2, "cur_time": "2024-08-21 06:46:59", "elapsed_time": "0:20:40", "remaining_time": "0:01:59", "throughput": "1934.02", "total_tokens": 2398736} +{"current_steps": 915, "total_steps": 1000, "loss": 1.2392, "learning_rate": 2.1847622018482283e-06, "epoch": 1.6266666666666667, "percentage": 91.5, "cur_time": "2024-08-21 06:47:03", "elapsed_time": "0:20:44", "remaining_time": "0:01:55", "throughput": "1934.32", "total_tokens": 2406864} +{"current_steps": 918, "total_steps": 1000, "loss": 1.104, "learning_rate": 2.0343012729971243e-06, "epoch": 1.6320000000000001, "percentage": 91.8, "cur_time": "2024-08-21 06:47:06", "elapsed_time": "0:20:48", "remaining_time": "0:01:51", "throughput": "1934.23", "total_tokens": 2414304} +{"current_steps": 921, "total_steps": 1000, "loss": 1.1601, "learning_rate": 1.8891003235357308e-06, "epoch": 1.6373333333333333, "percentage": 92.1, "cur_time": "2024-08-21 06:47:10", "elapsed_time": "0:20:52", "remaining_time": "0:01:47", "throughput": "1934.17", "total_tokens": 2421648} +{"current_steps": 924, "total_steps": 1000, "loss": 1.2597, "learning_rate": 1.7491752763844293e-06, "epoch": 1.6426666666666667, "percentage": 92.4, "cur_time": "2024-08-21 06:47:14", "elapsed_time": "0:20:56", "remaining_time": "0:01:43", "throughput": "1934.58", "total_tokens": 2429856} +{"current_steps": 927, "total_steps": 1000, "loss": 1.0914, "learning_rate": 1.6145414759014431e-06, "epoch": 1.6480000000000001, "percentage": 92.7, "cur_time": "2024-08-21 06:47:18", "elapsed_time": "0:20:59", "remaining_time": "0:01:39", "throughput": "1934.80", "total_tokens": 2437840} +{"current_steps": 930, "total_steps": 1000, "loss": 1.1627, "learning_rate": 1.4852136862001764e-06, "epoch": 1.6533333333333333, "percentage": 93.0, "cur_time": "2024-08-21 06:47:22", "elapsed_time": "0:21:04", "remaining_time": "0:01:35", "throughput": "1935.45", "total_tokens": 2446560} +{"current_steps": 933, "total_steps": 1000, "loss": 1.1386, "learning_rate": 1.3612060895301759e-06, "epoch": 1.6586666666666665, "percentage": 93.3, "cur_time": "2024-08-21 06:47:26", "elapsed_time": "0:21:07", "remaining_time": "0:01:31", "throughput": "1934.92", "total_tokens": 2453360} +{"current_steps": 936, "total_steps": 1000, "loss": 1.29, "learning_rate": 1.2425322847218368e-06, "epoch": 1.6640000000000001, "percentage": 93.6, "cur_time": "2024-08-21 06:47:30", "elapsed_time": "0:21:11", "remaining_time": "0:01:26", "throughput": "1935.22", "total_tokens": 2461600} +{"current_steps": 939, "total_steps": 1000, "loss": 1.0164, "learning_rate": 1.1292052856952062e-06, "epoch": 1.6693333333333333, "percentage": 93.9, "cur_time": "2024-08-21 06:47:34", "elapsed_time": "0:21:15", "remaining_time": "0:01:22", "throughput": "1935.15", "total_tokens": 2469168} +{"current_steps": 942, "total_steps": 1000, "loss": 1.0143, "learning_rate": 1.0212375200327973e-06, "epoch": 1.6746666666666665, "percentage": 94.2, "cur_time": "2024-08-21 06:47:38", "elapsed_time": "0:21:19", "remaining_time": "0:01:18", "throughput": "1935.21", "total_tokens": 2476688} +{"current_steps": 945, "total_steps": 1000, "loss": 1.1762, "learning_rate": 9.186408276168013e-07, "epoch": 1.6800000000000002, "percentage": 94.5, "cur_time": "2024-08-21 06:47:42", "elapsed_time": "0:21:23", "remaining_time": "0:01:14", "throughput": "1935.61", "total_tokens": 2485136} +{"current_steps": 948, "total_steps": 1000, "loss": 1.2691, "learning_rate": 8.214264593307098e-07, "epoch": 1.6853333333333333, "percentage": 94.8, "cur_time": "2024-08-21 06:47:46", "elapsed_time": "0:21:27", "remaining_time": "0:01:10", "throughput": "1935.09", "total_tokens": 2491568} +{"current_steps": 951, "total_steps": 1000, "loss": 1.3668, "learning_rate": 7.296050758254957e-07, "epoch": 1.6906666666666665, "percentage": 95.1, "cur_time": "2024-08-21 06:47:50", "elapsed_time": "0:21:31", "remaining_time": "0:01:06", "throughput": "1935.28", "total_tokens": 2499312} +{"current_steps": 954, "total_steps": 1000, "loss": 1.3101, "learning_rate": 6.431867463506048e-07, "epoch": 1.696, "percentage": 95.4, "cur_time": "2024-08-21 06:47:53", "elapsed_time": "0:21:35", "remaining_time": "0:01:02", "throughput": "1934.98", "total_tokens": 2506160} +{"current_steps": 957, "total_steps": 1000, "loss": 1.3651, "learning_rate": 5.621809476497098e-07, "epoch": 1.7013333333333334, "percentage": 95.7, "cur_time": "2024-08-21 06:47:57", "elapsed_time": "0:21:39", "remaining_time": "0:00:58", "throughput": "1935.20", "total_tokens": 2514000} +{"current_steps": 960, "total_steps": 1000, "loss": 1.0573, "learning_rate": 4.865965629214819e-07, "epoch": 1.7066666666666666, "percentage": 96.0, "cur_time": "2024-08-21 06:48:01", "elapsed_time": "0:21:42", "remaining_time": "0:00:54", "throughput": "1934.54", "total_tokens": 2520416} +{"current_steps": 963, "total_steps": 1000, "loss": 1.3964, "learning_rate": 4.1644188084548063e-07, "epoch": 1.712, "percentage": 96.3, "cur_time": "2024-08-21 06:48:05", "elapsed_time": "0:21:46", "remaining_time": "0:00:50", "throughput": "1934.39", "total_tokens": 2528112} +{"current_steps": 966, "total_steps": 1000, "loss": 1.1651, "learning_rate": 3.517245946731529e-07, "epoch": 1.7173333333333334, "percentage": 96.6, "cur_time": "2024-08-21 06:48:09", "elapsed_time": "0:21:50", "remaining_time": "0:00:46", "throughput": "1934.24", "total_tokens": 2535360} +{"current_steps": 969, "total_steps": 1000, "loss": 1.1533, "learning_rate": 2.924518013842303e-07, "epoch": 1.7226666666666666, "percentage": 96.9, "cur_time": "2024-08-21 06:48:13", "elapsed_time": "0:21:54", "remaining_time": "0:00:42", "throughput": "1934.84", "total_tokens": 2543792} +{"current_steps": 972, "total_steps": 1000, "loss": 1.2354, "learning_rate": 2.386300009084408e-07, "epoch": 1.728, "percentage": 97.2, "cur_time": "2024-08-21 06:48:17", "elapsed_time": "0:21:58", "remaining_time": "0:00:37", "throughput": "1936.05", "total_tokens": 2553136} +{"current_steps": 975, "total_steps": 1000, "loss": 1.1855, "learning_rate": 1.9026509541272275e-07, "epoch": 1.7333333333333334, "percentage": 97.5, "cur_time": "2024-08-21 06:48:21", "elapsed_time": "0:22:02", "remaining_time": "0:00:33", "throughput": "1936.76", "total_tokens": 2561840} +{"current_steps": 978, "total_steps": 1000, "loss": 0.9797, "learning_rate": 1.4736238865398765e-07, "epoch": 1.7386666666666666, "percentage": 97.8, "cur_time": "2024-08-21 06:48:25", "elapsed_time": "0:22:06", "remaining_time": "0:00:29", "throughput": "1937.26", "total_tokens": 2570032} +{"current_steps": 981, "total_steps": 1000, "loss": 1.1833, "learning_rate": 1.0992658539750178e-07, "epoch": 1.744, "percentage": 98.1, "cur_time": "2024-08-21 06:48:29", "elapsed_time": "0:22:10", "remaining_time": "0:00:25", "throughput": "1937.42", "total_tokens": 2577888} +{"current_steps": 984, "total_steps": 1000, "loss": 1.3613, "learning_rate": 7.796179090094891e-08, "epoch": 1.7493333333333334, "percentage": 98.4, "cur_time": "2024-08-21 06:48:33", "elapsed_time": "0:22:14", "remaining_time": "0:00:21", "throughput": "1938.20", "total_tokens": 2586448} +{"current_steps": 987, "total_steps": 1000, "loss": 1.1256, "learning_rate": 5.1471510464268236e-08, "epoch": 1.7546666666666666, "percentage": 98.7, "cur_time": "2024-08-21 06:48:37", "elapsed_time": "0:22:18", "remaining_time": "0:00:17", "throughput": "1938.18", "total_tokens": 2593744} +{"current_steps": 990, "total_steps": 1000, "loss": 1.1567, "learning_rate": 3.04586490452119e-08, "epoch": 1.76, "percentage": 99.0, "cur_time": "2024-08-21 06:48:40", "elapsed_time": "0:22:22", "remaining_time": "0:00:13", "throughput": "1938.64", "total_tokens": 2602048} +{"current_steps": 993, "total_steps": 1000, "loss": 1.302, "learning_rate": 1.4925510940844156e-08, "epoch": 1.7653333333333334, "percentage": 99.3, "cur_time": "2024-08-21 06:48:44", "elapsed_time": "0:22:26", "remaining_time": "0:00:09", "throughput": "1939.19", "total_tokens": 2610160} +{"current_steps": 996, "total_steps": 1000, "loss": 1.21, "learning_rate": 4.873799534788059e-09, "epoch": 1.7706666666666666, "percentage": 99.6, "cur_time": "2024-08-21 06:48:48", "elapsed_time": "0:22:29", "remaining_time": "0:00:05", "throughput": "1939.19", "total_tokens": 2617616} +{"current_steps": 999, "total_steps": 1000, "loss": 1.0737, "learning_rate": 3.0461711048035415e-10, "epoch": 1.776, "percentage": 99.9, "cur_time": "2024-08-21 06:48:52", "elapsed_time": "0:22:33", "remaining_time": "0:00:01", "throughput": "1938.85", "total_tokens": 2624416} +{"current_steps": 1000, "total_steps": 1000, "eval_loss": 1.258375644683838, "epoch": 1.7777777777777777, "percentage": 100.0, "cur_time": "2024-08-21 06:49:15", "elapsed_time": "0:22:56", "remaining_time": "0:00:00", "throughput": "1907.76", "total_tokens": 2626832} +{"current_steps": 1000, "total_steps": 1000, "epoch": 1.7777777777777777, "percentage": 100.0, "cur_time": "2024-08-21 06:49:15", "elapsed_time": "0:22:57", "remaining_time": "0:00:00", "throughput": "1907.42", "total_tokens": 2626832} diff --git a/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/trainer_state.json b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/trainer_state.json new file mode 100644 index 00000000..5b5a2e9f --- /dev/null +++ b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/trainer_state.json @@ -0,0 +1,2725 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.7777777777777777, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005333333333333333, + "grad_norm": 0.3827691078186035, + "learning_rate": 3e-06, + "loss": 1.5586, + "num_input_tokens_seen": 9920, + "step": 3 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 0.3326924741268158, + "learning_rate": 6e-06, + "loss": 1.6295, + "num_input_tokens_seen": 19376, + "step": 6 + }, + { + "epoch": 0.016, + "grad_norm": 0.36344507336616516, + "learning_rate": 9e-06, + "loss": 1.7438, + "num_input_tokens_seen": 29488, + "step": 9 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 0.4467258155345917, + "learning_rate": 1.2e-05, + "loss": 1.8413, + "num_input_tokens_seen": 38208, + "step": 12 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.40837302803993225, + "learning_rate": 1.5e-05, + "loss": 1.5369, + "num_input_tokens_seen": 44624, + "step": 15 + }, + { + "epoch": 0.032, + "grad_norm": 0.6898334622383118, + "learning_rate": 1.8e-05, + "loss": 1.77, + "num_input_tokens_seen": 52416, + "step": 18 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 0.5511844158172607, + "learning_rate": 2.1e-05, + "loss": 1.6273, + "num_input_tokens_seen": 60368, + "step": 21 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 0.8902711272239685, + "learning_rate": 2.4e-05, + "loss": 1.9737, + "num_input_tokens_seen": 66784, + "step": 24 + }, + { + "epoch": 0.048, + "grad_norm": 0.5174709558486938, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.6169, + "num_input_tokens_seen": 73344, + "step": 27 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.6341367363929749, + "learning_rate": 3e-05, + "loss": 1.8011, + "num_input_tokens_seen": 82032, + "step": 30 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 0.43879804015159607, + "learning_rate": 3.3e-05, + "loss": 1.773, + "num_input_tokens_seen": 89808, + "step": 33 + }, + { + "epoch": 0.064, + "grad_norm": 0.6926860213279724, + "learning_rate": 3.6e-05, + "loss": 1.6426, + "num_input_tokens_seen": 96080, + "step": 36 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 0.8264650106430054, + "learning_rate": 3.9000000000000006e-05, + "loss": 1.548, + "num_input_tokens_seen": 102784, + "step": 39 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 0.47357088327407837, + "learning_rate": 4.2e-05, + "loss": 1.5749, + "num_input_tokens_seen": 111184, + "step": 42 + }, + { + "epoch": 0.08, + "grad_norm": 0.5448750853538513, + "learning_rate": 4.5e-05, + "loss": 1.7287, + "num_input_tokens_seen": 116784, + "step": 45 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 0.6237770318984985, + "learning_rate": 4.8e-05, + "loss": 1.4529, + "num_input_tokens_seen": 125472, + "step": 48 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.68182373046875, + "learning_rate": 5.1000000000000006e-05, + "loss": 1.6277, + "num_input_tokens_seen": 133360, + "step": 51 + }, + { + "epoch": 0.096, + "grad_norm": 0.7576949000358582, + "learning_rate": 5.4000000000000005e-05, + "loss": 1.6691, + "num_input_tokens_seen": 140336, + "step": 54 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 0.7188912630081177, + "learning_rate": 5.6999999999999996e-05, + "loss": 1.3956, + "num_input_tokens_seen": 148112, + "step": 57 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.6228360533714294, + "learning_rate": 6e-05, + "loss": 1.545, + "num_input_tokens_seen": 156480, + "step": 60 + }, + { + "epoch": 0.112, + "grad_norm": 0.6807077527046204, + "learning_rate": 6.3e-05, + "loss": 1.5088, + "num_input_tokens_seen": 164048, + "step": 63 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 0.9484089612960815, + "learning_rate": 6.6e-05, + "loss": 1.484, + "num_input_tokens_seen": 172192, + "step": 66 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 0.6590979695320129, + "learning_rate": 6.9e-05, + "loss": 1.651, + "num_input_tokens_seen": 180528, + "step": 69 + }, + { + "epoch": 0.128, + "grad_norm": 1.332999587059021, + "learning_rate": 7.2e-05, + "loss": 1.4958, + "num_input_tokens_seen": 187728, + "step": 72 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.6886986494064331, + "learning_rate": 7.500000000000001e-05, + "loss": 1.499, + "num_input_tokens_seen": 194592, + "step": 75 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.5862389206886292, + "learning_rate": 7.800000000000001e-05, + "loss": 1.3138, + "num_input_tokens_seen": 202416, + "step": 78 + }, + { + "epoch": 0.144, + "grad_norm": 0.3789741098880768, + "learning_rate": 8.1e-05, + "loss": 1.2772, + "num_input_tokens_seen": 211328, + "step": 81 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 1.1701974868774414, + "learning_rate": 8.4e-05, + "loss": 1.4917, + "num_input_tokens_seen": 218544, + "step": 84 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 0.4418068528175354, + "learning_rate": 8.7e-05, + "loss": 1.4778, + "num_input_tokens_seen": 227632, + "step": 87 + }, + { + "epoch": 0.16, + "grad_norm": 0.5291661024093628, + "learning_rate": 9e-05, + "loss": 1.2799, + "num_input_tokens_seen": 235440, + "step": 90 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 0.40466922521591187, + "learning_rate": 9.300000000000001e-05, + "loss": 1.4626, + "num_input_tokens_seen": 244032, + "step": 93 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 0.9963971972465515, + "learning_rate": 9.6e-05, + "loss": 1.5386, + "num_input_tokens_seen": 250480, + "step": 96 + }, + { + "epoch": 0.176, + "grad_norm": 0.5913766026496887, + "learning_rate": 9.900000000000001e-05, + "loss": 1.4591, + "num_input_tokens_seen": 259600, + "step": 99 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 0.6860774755477905, + "learning_rate": 9.999878153526974e-05, + "loss": 1.3403, + "num_input_tokens_seen": 267216, + "step": 102 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.7265311479568481, + "learning_rate": 9.999238475781957e-05, + "loss": 1.2782, + "num_input_tokens_seen": 274624, + "step": 105 + }, + { + "epoch": 0.192, + "grad_norm": 0.7685508728027344, + "learning_rate": 9.998050575201771e-05, + "loss": 1.28, + "num_input_tokens_seen": 280832, + "step": 108 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 0.6996146440505981, + "learning_rate": 9.996314582053106e-05, + "loss": 1.476, + "num_input_tokens_seen": 288544, + "step": 111 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 0.8047633171081543, + "learning_rate": 9.99403068670717e-05, + "loss": 1.4419, + "num_input_tokens_seen": 296560, + "step": 114 + }, + { + "epoch": 0.208, + "grad_norm": 0.46957364678382874, + "learning_rate": 9.991199139618827e-05, + "loss": 1.3927, + "num_input_tokens_seen": 303600, + "step": 117 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.6611707806587219, + "learning_rate": 9.987820251299122e-05, + "loss": 1.3418, + "num_input_tokens_seen": 310800, + "step": 120 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 1.3713186979293823, + "learning_rate": 9.983894392281237e-05, + "loss": 1.2896, + "num_input_tokens_seen": 317936, + "step": 123 + }, + { + "epoch": 0.224, + "grad_norm": 0.40376824140548706, + "learning_rate": 9.979421993079852e-05, + "loss": 1.4422, + "num_input_tokens_seen": 325936, + "step": 126 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 0.6310911178588867, + "learning_rate": 9.974403544143941e-05, + "loss": 1.319, + "num_input_tokens_seen": 334592, + "step": 129 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 0.6696099638938904, + "learning_rate": 9.968839595802982e-05, + "loss": 1.1798, + "num_input_tokens_seen": 341200, + "step": 132 + }, + { + "epoch": 0.24, + "grad_norm": 0.5329192876815796, + "learning_rate": 9.962730758206611e-05, + "loss": 1.2938, + "num_input_tokens_seen": 347888, + "step": 135 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.41395628452301025, + "learning_rate": 9.956077701257709e-05, + "loss": 1.4067, + "num_input_tokens_seen": 356864, + "step": 138 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 0.49461355805397034, + "learning_rate": 9.948881154538945e-05, + "loss": 1.3325, + "num_input_tokens_seen": 365440, + "step": 141 + }, + { + "epoch": 0.256, + "grad_norm": 0.47386232018470764, + "learning_rate": 9.941141907232765e-05, + "loss": 1.3159, + "num_input_tokens_seen": 373312, + "step": 144 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 0.9276071786880493, + "learning_rate": 9.932860808034848e-05, + "loss": 1.4941, + "num_input_tokens_seen": 383552, + "step": 147 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.7177005410194397, + "learning_rate": 9.924038765061042e-05, + "loss": 1.2139, + "num_input_tokens_seen": 390928, + "step": 150 + }, + { + "epoch": 0.272, + "grad_norm": 0.5159232020378113, + "learning_rate": 9.914676745747772e-05, + "loss": 1.3313, + "num_input_tokens_seen": 398224, + "step": 153 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 0.651279866695404, + "learning_rate": 9.904775776745958e-05, + "loss": 1.2673, + "num_input_tokens_seen": 406656, + "step": 156 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 0.7800387740135193, + "learning_rate": 9.894336943808426e-05, + "loss": 1.357, + "num_input_tokens_seen": 413584, + "step": 159 + }, + { + "epoch": 0.288, + "grad_norm": 0.9104688763618469, + "learning_rate": 9.88336139167084e-05, + "loss": 1.2238, + "num_input_tokens_seen": 420240, + "step": 162 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.8791072964668274, + "learning_rate": 9.871850323926177e-05, + "loss": 1.2388, + "num_input_tokens_seen": 426608, + "step": 165 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 0.6322370767593384, + "learning_rate": 9.859805002892732e-05, + "loss": 1.3229, + "num_input_tokens_seen": 434224, + "step": 168 + }, + { + "epoch": 0.304, + "grad_norm": 0.5572423338890076, + "learning_rate": 9.847226749475695e-05, + "loss": 1.387, + "num_input_tokens_seen": 443808, + "step": 171 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 1.0534051656723022, + "learning_rate": 9.834116943022298e-05, + "loss": 1.5095, + "num_input_tokens_seen": 452080, + "step": 174 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 0.581840991973877, + "learning_rate": 9.820477021170551e-05, + "loss": 1.2474, + "num_input_tokens_seen": 460528, + "step": 177 + }, + { + "epoch": 0.32, + "grad_norm": 0.6569491028785706, + "learning_rate": 9.806308479691595e-05, + "loss": 1.2774, + "num_input_tokens_seen": 469856, + "step": 180 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 0.6684996485710144, + "learning_rate": 9.791612872325667e-05, + "loss": 1.2836, + "num_input_tokens_seen": 477216, + "step": 183 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.5214691758155823, + "learning_rate": 9.776391810611718e-05, + "loss": 1.4973, + "num_input_tokens_seen": 485392, + "step": 186 + }, + { + "epoch": 0.336, + "grad_norm": 0.60368812084198, + "learning_rate": 9.760646963710694e-05, + "loss": 1.2715, + "num_input_tokens_seen": 492832, + "step": 189 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 1.012483835220337, + "learning_rate": 9.744380058222483e-05, + "loss": 1.303, + "num_input_tokens_seen": 500112, + "step": 192 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.5348647236824036, + "learning_rate": 9.727592877996585e-05, + "loss": 1.3055, + "num_input_tokens_seen": 508384, + "step": 195 + }, + { + "epoch": 0.352, + "grad_norm": 0.6294691562652588, + "learning_rate": 9.710287263936484e-05, + "loss": 1.551, + "num_input_tokens_seen": 516576, + "step": 198 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 0.6876276731491089, + "learning_rate": 9.69246511379778e-05, + "loss": 1.4759, + "num_input_tokens_seen": 523456, + "step": 201 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.6463469862937927, + "learning_rate": 9.674128381980072e-05, + "loss": 1.358, + "num_input_tokens_seen": 530768, + "step": 204 + }, + { + "epoch": 0.368, + "grad_norm": 0.41761478781700134, + "learning_rate": 9.655279079312642e-05, + "loss": 1.2757, + "num_input_tokens_seen": 538288, + "step": 207 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.6983274817466736, + "learning_rate": 9.635919272833938e-05, + "loss": 1.2448, + "num_input_tokens_seen": 546816, + "step": 210 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 0.49796414375305176, + "learning_rate": 9.616051085564906e-05, + "loss": 1.3256, + "num_input_tokens_seen": 554800, + "step": 213 + }, + { + "epoch": 0.384, + "grad_norm": 0.9893045425415039, + "learning_rate": 9.595676696276172e-05, + "loss": 1.254, + "num_input_tokens_seen": 560368, + "step": 216 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 1.2111352682113647, + "learning_rate": 9.574798339249125e-05, + "loss": 1.3174, + "num_input_tokens_seen": 567504, + "step": 219 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 0.6263272166252136, + "learning_rate": 9.553418304030886e-05, + "loss": 1.1809, + "num_input_tokens_seen": 575440, + "step": 222 + }, + { + "epoch": 0.4, + "grad_norm": 0.9461821913719177, + "learning_rate": 9.53153893518325e-05, + "loss": 1.3219, + "num_input_tokens_seen": 583024, + "step": 225 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.5056630969047546, + "learning_rate": 9.50916263202557e-05, + "loss": 1.4601, + "num_input_tokens_seen": 591840, + "step": 228 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 0.4600001573562622, + "learning_rate": 9.486291848371643e-05, + "loss": 1.3049, + "num_input_tokens_seen": 599392, + "step": 231 + }, + { + "epoch": 0.416, + "grad_norm": 0.587371826171875, + "learning_rate": 9.462929092260628e-05, + "loss": 1.4101, + "num_input_tokens_seen": 606560, + "step": 234 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 0.5228914022445679, + "learning_rate": 9.439076925682006e-05, + "loss": 1.1738, + "num_input_tokens_seen": 615216, + "step": 237 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.5150928497314453, + "learning_rate": 9.414737964294636e-05, + "loss": 1.3717, + "num_input_tokens_seen": 623696, + "step": 240 + }, + { + "epoch": 0.432, + "grad_norm": 0.4762982130050659, + "learning_rate": 9.389914877139903e-05, + "loss": 1.3903, + "num_input_tokens_seen": 631568, + "step": 243 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.43511807918548584, + "learning_rate": 9.364610386349049e-05, + "loss": 1.3946, + "num_input_tokens_seen": 639424, + "step": 246 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 0.6298917531967163, + "learning_rate": 9.338827266844644e-05, + "loss": 1.22, + "num_input_tokens_seen": 647696, + "step": 249 + }, + { + "epoch": 0.448, + "grad_norm": 0.5033426880836487, + "learning_rate": 9.312568346036288e-05, + "loss": 1.3776, + "num_input_tokens_seen": 655600, + "step": 252 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.639208972454071, + "learning_rate": 9.285836503510562e-05, + "loss": 1.3568, + "num_input_tokens_seen": 663200, + "step": 255 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.7679170966148376, + "learning_rate": 9.258634670715238e-05, + "loss": 1.2346, + "num_input_tokens_seen": 670256, + "step": 258 + }, + { + "epoch": 0.464, + "grad_norm": 0.5441117286682129, + "learning_rate": 9.230965830637821e-05, + "loss": 1.4332, + "num_input_tokens_seen": 678304, + "step": 261 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 0.638019323348999, + "learning_rate": 9.202833017478422e-05, + "loss": 1.3245, + "num_input_tokens_seen": 687392, + "step": 264 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 0.41635143756866455, + "learning_rate": 9.174239316317033e-05, + "loss": 1.3583, + "num_input_tokens_seen": 695344, + "step": 267 + }, + { + "epoch": 0.48, + "grad_norm": 1.1276559829711914, + "learning_rate": 9.145187862775209e-05, + "loss": 1.271, + "num_input_tokens_seen": 704736, + "step": 270 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 0.78920978307724, + "learning_rate": 9.11568184267221e-05, + "loss": 1.207, + "num_input_tokens_seen": 710720, + "step": 273 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.36885419487953186, + "learning_rate": 9.085724491675642e-05, + "loss": 1.1755, + "num_input_tokens_seen": 718128, + "step": 276 + }, + { + "epoch": 0.496, + "grad_norm": 0.7129054069519043, + "learning_rate": 9.055319094946633e-05, + "loss": 1.2883, + "num_input_tokens_seen": 725568, + "step": 279 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.8274949789047241, + "learning_rate": 9.02446898677957e-05, + "loss": 1.314, + "num_input_tokens_seen": 732928, + "step": 282 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.6434007883071899, + "learning_rate": 8.993177550236464e-05, + "loss": 1.2864, + "num_input_tokens_seen": 741536, + "step": 285 + }, + { + "epoch": 0.512, + "grad_norm": 0.5015966296195984, + "learning_rate": 8.961448216775954e-05, + "loss": 1.227, + "num_input_tokens_seen": 748288, + "step": 288 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 0.4610384702682495, + "learning_rate": 8.92928446587701e-05, + "loss": 1.3016, + "num_input_tokens_seen": 757216, + "step": 291 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.7528018355369568, + "learning_rate": 8.896689824657372e-05, + "loss": 1.2786, + "num_input_tokens_seen": 764240, + "step": 294 + }, + { + "epoch": 0.528, + "grad_norm": 0.5625883340835571, + "learning_rate": 8.863667867486756e-05, + "loss": 1.3358, + "num_input_tokens_seen": 774416, + "step": 297 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.720058798789978, + "learning_rate": 8.83022221559489e-05, + "loss": 1.3911, + "num_input_tokens_seen": 782656, + "step": 300 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 0.4910522997379303, + "learning_rate": 8.796356536674403e-05, + "loss": 1.193, + "num_input_tokens_seen": 790032, + "step": 303 + }, + { + "epoch": 0.544, + "grad_norm": 0.6103739142417908, + "learning_rate": 8.762074544478623e-05, + "loss": 1.2034, + "num_input_tokens_seen": 798048, + "step": 306 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 0.5440624356269836, + "learning_rate": 8.727379998414311e-05, + "loss": 1.1278, + "num_input_tokens_seen": 805792, + "step": 309 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.7663698792457581, + "learning_rate": 8.692276703129421e-05, + "loss": 1.3952, + "num_input_tokens_seen": 813280, + "step": 312 + }, + { + "epoch": 0.56, + "grad_norm": 0.45255619287490845, + "learning_rate": 8.656768508095853e-05, + "loss": 1.3797, + "num_input_tokens_seen": 822464, + "step": 315 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.5178409218788147, + "learning_rate": 8.620859307187339e-05, + "loss": 1.311, + "num_input_tokens_seen": 832672, + "step": 318 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 0.44465571641921997, + "learning_rate": 8.584553038252414e-05, + "loss": 1.3417, + "num_input_tokens_seen": 841248, + "step": 321 + }, + { + "epoch": 0.576, + "grad_norm": 0.5521979331970215, + "learning_rate": 8.547853682682604e-05, + "loss": 1.2492, + "num_input_tokens_seen": 848432, + "step": 324 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 0.5571572780609131, + "learning_rate": 8.510765264975813e-05, + "loss": 1.4109, + "num_input_tokens_seen": 858096, + "step": 327 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.389121949672699, + "learning_rate": 8.473291852294987e-05, + "loss": 1.3382, + "num_input_tokens_seen": 866784, + "step": 330 + }, + { + "epoch": 0.592, + "grad_norm": 0.491073876619339, + "learning_rate": 8.435437554022115e-05, + "loss": 1.2248, + "num_input_tokens_seen": 873904, + "step": 333 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.7570438385009766, + "learning_rate": 8.397206521307584e-05, + "loss": 1.1435, + "num_input_tokens_seen": 879856, + "step": 336 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 0.5216025114059448, + "learning_rate": 8.358602946614951e-05, + "loss": 1.1193, + "num_input_tokens_seen": 887152, + "step": 339 + }, + { + "epoch": 0.608, + "grad_norm": 0.47078531980514526, + "learning_rate": 8.319631063261209e-05, + "loss": 1.2794, + "num_input_tokens_seen": 896752, + "step": 342 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.5252036452293396, + "learning_rate": 8.280295144952536e-05, + "loss": 1.2186, + "num_input_tokens_seen": 905696, + "step": 345 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.5388907790184021, + "learning_rate": 8.240599505315655e-05, + "loss": 1.2683, + "num_input_tokens_seen": 913536, + "step": 348 + }, + { + "epoch": 0.624, + "grad_norm": 0.5209746360778809, + "learning_rate": 8.200548497424778e-05, + "loss": 1.1247, + "num_input_tokens_seen": 919584, + "step": 351 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.4642646014690399, + "learning_rate": 8.160146513324254e-05, + "loss": 1.4098, + "num_input_tokens_seen": 927248, + "step": 354 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 0.501924991607666, + "learning_rate": 8.119397983546932e-05, + "loss": 1.2803, + "num_input_tokens_seen": 937328, + "step": 357 + }, + { + "epoch": 0.64, + "grad_norm": 0.39491286873817444, + "learning_rate": 8.07830737662829e-05, + "loss": 1.4474, + "num_input_tokens_seen": 946400, + "step": 360 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 1.1283897161483765, + "learning_rate": 8.036879198616434e-05, + "loss": 1.3626, + "num_input_tokens_seen": 954912, + "step": 363 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 0.5616379976272583, + "learning_rate": 7.99511799257793e-05, + "loss": 1.2167, + "num_input_tokens_seen": 963056, + "step": 366 + }, + { + "epoch": 0.656, + "grad_norm": 0.7390901446342468, + "learning_rate": 7.953028338099627e-05, + "loss": 1.2025, + "num_input_tokens_seen": 970048, + "step": 369 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.9030864238739014, + "learning_rate": 7.910614850786448e-05, + "loss": 1.313, + "num_input_tokens_seen": 977312, + "step": 372 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.61843341588974, + "learning_rate": 7.86788218175523e-05, + "loss": 1.479, + "num_input_tokens_seen": 984576, + "step": 375 + }, + { + "epoch": 0.672, + "grad_norm": 0.5310498476028442, + "learning_rate": 7.82483501712469e-05, + "loss": 1.435, + "num_input_tokens_seen": 992464, + "step": 378 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 0.5555992126464844, + "learning_rate": 7.781478077501525e-05, + "loss": 1.1344, + "num_input_tokens_seen": 1000400, + "step": 381 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.4795224070549011, + "learning_rate": 7.737816117462752e-05, + "loss": 1.2989, + "num_input_tokens_seen": 1007952, + "step": 384 + }, + { + "epoch": 0.688, + "grad_norm": 0.6490950584411621, + "learning_rate": 7.693853925034315e-05, + "loss": 1.0671, + "num_input_tokens_seen": 1014064, + "step": 387 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 1.0598840713500977, + "learning_rate": 7.649596321166024e-05, + "loss": 1.2786, + "num_input_tokens_seen": 1021472, + "step": 390 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 0.5905397534370422, + "learning_rate": 7.605048159202883e-05, + "loss": 1.4096, + "num_input_tokens_seen": 1028848, + "step": 393 + }, + { + "epoch": 0.704, + "grad_norm": 0.6974928975105286, + "learning_rate": 7.560214324352858e-05, + "loss": 1.1785, + "num_input_tokens_seen": 1037760, + "step": 396 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 0.6378306150436401, + "learning_rate": 7.515099733151177e-05, + "loss": 1.2757, + "num_input_tokens_seen": 1045472, + "step": 399 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 0.6533056497573853, + "learning_rate": 7.469709332921155e-05, + "loss": 1.2499, + "num_input_tokens_seen": 1053024, + "step": 402 + }, + { + "epoch": 0.72, + "grad_norm": 0.762304961681366, + "learning_rate": 7.424048101231686e-05, + "loss": 1.2605, + "num_input_tokens_seen": 1060336, + "step": 405 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 0.5734354853630066, + "learning_rate": 7.378121045351378e-05, + "loss": 1.2975, + "num_input_tokens_seen": 1067184, + "step": 408 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 0.7655225396156311, + "learning_rate": 7.331933201699457e-05, + "loss": 1.1854, + "num_input_tokens_seen": 1074768, + "step": 411 + }, + { + "epoch": 0.736, + "grad_norm": 0.7301100492477417, + "learning_rate": 7.285489635293472e-05, + "loss": 1.1631, + "num_input_tokens_seen": 1083360, + "step": 414 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 0.6851321458816528, + "learning_rate": 7.238795439193848e-05, + "loss": 1.2159, + "num_input_tokens_seen": 1091200, + "step": 417 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.513536810874939, + "learning_rate": 7.191855733945387e-05, + "loss": 1.201, + "num_input_tokens_seen": 1100976, + "step": 420 + }, + { + "epoch": 0.752, + "grad_norm": 0.9055917263031006, + "learning_rate": 7.14467566701573e-05, + "loss": 1.3129, + "num_input_tokens_seen": 1109264, + "step": 423 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 0.5442625880241394, + "learning_rate": 7.097260412230886e-05, + "loss": 1.5195, + "num_input_tokens_seen": 1117568, + "step": 426 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 0.48822855949401855, + "learning_rate": 7.049615169207864e-05, + "loss": 1.3843, + "num_input_tokens_seen": 1126416, + "step": 429 + }, + { + "epoch": 0.768, + "grad_norm": 0.4853934645652771, + "learning_rate": 7.001745162784477e-05, + "loss": 1.1864, + "num_input_tokens_seen": 1133984, + "step": 432 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.7485060095787048, + "learning_rate": 6.953655642446368e-05, + "loss": 1.4373, + "num_input_tokens_seen": 1142608, + "step": 435 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 0.6429968476295471, + "learning_rate": 6.905351881751372e-05, + "loss": 1.2651, + "num_input_tokens_seen": 1152304, + "step": 438 + }, + { + "epoch": 0.784, + "grad_norm": 0.6068575978279114, + "learning_rate": 6.856839177751176e-05, + "loss": 1.2864, + "num_input_tokens_seen": 1161072, + "step": 441 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 0.697914719581604, + "learning_rate": 6.808122850410461e-05, + "loss": 1.1525, + "num_input_tokens_seen": 1168480, + "step": 444 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 0.8854771852493286, + "learning_rate": 6.759208242023509e-05, + "loss": 1.3601, + "num_input_tokens_seen": 1175392, + "step": 447 + }, + { + "epoch": 0.8, + "grad_norm": 0.5190437436103821, + "learning_rate": 6.710100716628344e-05, + "loss": 1.4018, + "num_input_tokens_seen": 1184544, + "step": 450 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 0.6468976736068726, + "learning_rate": 6.660805659418516e-05, + "loss": 1.2425, + "num_input_tokens_seen": 1192880, + "step": 453 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 0.5303670763969421, + "learning_rate": 6.611328476152557e-05, + "loss": 1.2589, + "num_input_tokens_seen": 1200928, + "step": 456 + }, + { + "epoch": 0.816, + "grad_norm": 0.7813239097595215, + "learning_rate": 6.561674592561163e-05, + "loss": 1.3289, + "num_input_tokens_seen": 1209056, + "step": 459 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 0.4044243395328522, + "learning_rate": 6.511849453752223e-05, + "loss": 1.2851, + "num_input_tokens_seen": 1217040, + "step": 462 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.6317430734634399, + "learning_rate": 6.461858523613684e-05, + "loss": 1.2584, + "num_input_tokens_seen": 1223712, + "step": 465 + }, + { + "epoch": 0.832, + "grad_norm": 0.5451323986053467, + "learning_rate": 6.411707284214384e-05, + "loss": 1.2729, + "num_input_tokens_seen": 1230736, + "step": 468 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 0.5434067249298096, + "learning_rate": 6.361401235202872e-05, + "loss": 1.3603, + "num_input_tokens_seen": 1237728, + "step": 471 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 0.41243186593055725, + "learning_rate": 6.310945893204324e-05, + "loss": 1.1509, + "num_input_tokens_seen": 1247568, + "step": 474 + }, + { + "epoch": 0.848, + "grad_norm": 0.9249187111854553, + "learning_rate": 6.26034679121557e-05, + "loss": 1.3986, + "num_input_tokens_seen": 1255968, + "step": 477 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.5903889536857605, + "learning_rate": 6.209609477998338e-05, + "loss": 1.3266, + "num_input_tokens_seen": 1262640, + "step": 480 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 0.880121111869812, + "learning_rate": 6.158739517470786e-05, + "loss": 1.5078, + "num_input_tokens_seen": 1270464, + "step": 483 + }, + { + "epoch": 0.864, + "grad_norm": 0.8485903143882751, + "learning_rate": 6.107742488097338e-05, + "loss": 1.0109, + "num_input_tokens_seen": 1278400, + "step": 486 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 0.8262659311294556, + "learning_rate": 6.056623982276944e-05, + "loss": 1.3723, + "num_input_tokens_seen": 1287072, + "step": 489 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 0.9479841589927673, + "learning_rate": 6.005389605729824e-05, + "loss": 1.3698, + "num_input_tokens_seen": 1294784, + "step": 492 + }, + { + "epoch": 0.88, + "grad_norm": 0.8312945365905762, + "learning_rate": 5.9540449768827246e-05, + "loss": 1.2799, + "num_input_tokens_seen": 1303872, + "step": 495 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 0.8914958238601685, + "learning_rate": 5.902595726252801e-05, + "loss": 1.1711, + "num_input_tokens_seen": 1309888, + "step": 498 + }, + { + "epoch": 0.8888888888888888, + "eval_loss": 1.2698992490768433, + "eval_runtime": 23.2641, + "eval_samples_per_second": 42.985, + "eval_steps_per_second": 21.492, + "num_input_tokens_seen": 1314896, + "step": 500 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 0.839598536491394, + "learning_rate": 5.851047495830163e-05, + "loss": 1.3553, + "num_input_tokens_seen": 1317600, + "step": 501 + }, + { + "epoch": 0.896, + "grad_norm": 0.6949290037155151, + "learning_rate": 5.799405938459175e-05, + "loss": 1.4642, + "num_input_tokens_seen": 1326576, + "step": 504 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 0.594464898109436, + "learning_rate": 5.747676717218549e-05, + "loss": 1.1863, + "num_input_tokens_seen": 1332944, + "step": 507 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.502791166305542, + "learning_rate": 5.695865504800327e-05, + "loss": 1.0854, + "num_input_tokens_seen": 1340672, + "step": 510 + }, + { + "epoch": 0.912, + "grad_norm": 0.8494873046875, + "learning_rate": 5.643977982887815e-05, + "loss": 1.1361, + "num_input_tokens_seen": 1347408, + "step": 513 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 0.5948024392127991, + "learning_rate": 5.5920198415325064e-05, + "loss": 1.5376, + "num_input_tokens_seen": 1356096, + "step": 516 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 0.8378577828407288, + "learning_rate": 5.539996778530115e-05, + "loss": 1.3248, + "num_input_tokens_seen": 1364832, + "step": 519 + }, + { + "epoch": 0.928, + "grad_norm": 0.7156303524971008, + "learning_rate": 5.487914498795747e-05, + "loss": 1.2347, + "num_input_tokens_seen": 1371520, + "step": 522 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.6701870560646057, + "learning_rate": 5.435778713738292e-05, + "loss": 1.2789, + "num_input_tokens_seen": 1382112, + "step": 525 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 0.6556785106658936, + "learning_rate": 5.383595140634093e-05, + "loss": 1.2137, + "num_input_tokens_seen": 1390880, + "step": 528 + }, + { + "epoch": 0.944, + "grad_norm": 0.5082133412361145, + "learning_rate": 5.3313695020000024e-05, + "loss": 1.1884, + "num_input_tokens_seen": 1399248, + "step": 531 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 0.8579297065734863, + "learning_rate": 5.279107524965819e-05, + "loss": 1.0717, + "num_input_tokens_seen": 1407344, + "step": 534 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 0.7548760771751404, + "learning_rate": 5.226814940646269e-05, + "loss": 1.341, + "num_input_tokens_seen": 1414864, + "step": 537 + }, + { + "epoch": 0.96, + "grad_norm": 0.42420780658721924, + "learning_rate": 5.174497483512506e-05, + "loss": 1.2898, + "num_input_tokens_seen": 1424144, + "step": 540 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 0.6996252536773682, + "learning_rate": 5.1221608907632665e-05, + "loss": 1.226, + "num_input_tokens_seen": 1431120, + "step": 543 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 0.8375110030174255, + "learning_rate": 5.0698109016957274e-05, + "loss": 1.3647, + "num_input_tokens_seen": 1439056, + "step": 546 + }, + { + "epoch": 0.976, + "grad_norm": 0.4961775541305542, + "learning_rate": 5.017453257076119e-05, + "loss": 1.168, + "num_input_tokens_seen": 1447184, + "step": 549 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 0.7079893350601196, + "learning_rate": 4.965093698510193e-05, + "loss": 1.3466, + "num_input_tokens_seen": 1454992, + "step": 552 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.7946033477783203, + "learning_rate": 4.912737967813583e-05, + "loss": 1.1918, + "num_input_tokens_seen": 1462048, + "step": 555 + }, + { + "epoch": 0.992, + "grad_norm": 0.6242976784706116, + "learning_rate": 4.860391806382157e-05, + "loss": 1.2577, + "num_input_tokens_seen": 1469024, + "step": 558 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 0.681969940662384, + "learning_rate": 4.8080609545624004e-05, + "loss": 1.2251, + "num_input_tokens_seen": 1476496, + "step": 561 + }, + { + "epoch": 1.0026666666666666, + "grad_norm": 0.4314422011375427, + "learning_rate": 4.755751151021934e-05, + "loss": 1.3152, + "num_input_tokens_seen": 1485568, + "step": 564 + }, + { + "epoch": 1.008, + "grad_norm": 0.7379014492034912, + "learning_rate": 4.703468132120193e-05, + "loss": 1.1243, + "num_input_tokens_seen": 1492544, + "step": 567 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 0.5139598846435547, + "learning_rate": 4.6512176312793736e-05, + "loss": 1.3484, + "num_input_tokens_seen": 1501216, + "step": 570 + }, + { + "epoch": 1.0186666666666666, + "grad_norm": 0.735149085521698, + "learning_rate": 4.599005378355706e-05, + "loss": 1.1443, + "num_input_tokens_seen": 1509824, + "step": 573 + }, + { + "epoch": 1.024, + "grad_norm": 0.5389354228973389, + "learning_rate": 4.5468370990111006e-05, + "loss": 1.1144, + "num_input_tokens_seen": 1516736, + "step": 576 + }, + { + "epoch": 1.0293333333333334, + "grad_norm": 0.6230564713478088, + "learning_rate": 4.494718514085268e-05, + "loss": 1.4006, + "num_input_tokens_seen": 1524208, + "step": 579 + }, + { + "epoch": 1.0346666666666666, + "grad_norm": 0.5797858834266663, + "learning_rate": 4.442655338968373e-05, + "loss": 1.2874, + "num_input_tokens_seen": 1534656, + "step": 582 + }, + { + "epoch": 1.04, + "grad_norm": 0.5728005170822144, + "learning_rate": 4.390653282974264e-05, + "loss": 1.0906, + "num_input_tokens_seen": 1541136, + "step": 585 + }, + { + "epoch": 1.0453333333333332, + "grad_norm": 0.5849784016609192, + "learning_rate": 4.3387180487143876e-05, + "loss": 1.1907, + "num_input_tokens_seen": 1550640, + "step": 588 + }, + { + "epoch": 1.0506666666666666, + "grad_norm": 1.1090894937515259, + "learning_rate": 4.2868553314724425e-05, + "loss": 1.2323, + "num_input_tokens_seen": 1557488, + "step": 591 + }, + { + "epoch": 1.056, + "grad_norm": 0.5953820943832397, + "learning_rate": 4.23507081857981e-05, + "loss": 1.3508, + "num_input_tokens_seen": 1565392, + "step": 594 + }, + { + "epoch": 1.0613333333333332, + "grad_norm": 0.45134949684143066, + "learning_rate": 4.1833701887918904e-05, + "loss": 1.3322, + "num_input_tokens_seen": 1574080, + "step": 597 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.9110301733016968, + "learning_rate": 4.131759111665349e-05, + "loss": 1.0924, + "num_input_tokens_seen": 1581312, + "step": 600 + }, + { + "epoch": 1.072, + "grad_norm": 1.416802167892456, + "learning_rate": 4.080243246936399e-05, + "loss": 1.2264, + "num_input_tokens_seen": 1588096, + "step": 603 + }, + { + "epoch": 1.0773333333333333, + "grad_norm": 0.8100959062576294, + "learning_rate": 4.028828243900141e-05, + "loss": 1.05, + "num_input_tokens_seen": 1594736, + "step": 606 + }, + { + "epoch": 1.0826666666666667, + "grad_norm": 0.5712919235229492, + "learning_rate": 3.9775197407910485e-05, + "loss": 1.3512, + "num_input_tokens_seen": 1603312, + "step": 609 + }, + { + "epoch": 1.088, + "grad_norm": 0.7171860933303833, + "learning_rate": 3.926323364164684e-05, + "loss": 1.281, + "num_input_tokens_seen": 1613232, + "step": 612 + }, + { + "epoch": 1.0933333333333333, + "grad_norm": 0.633402407169342, + "learning_rate": 3.875244728280676e-05, + "loss": 1.3661, + "num_input_tokens_seen": 1621936, + "step": 615 + }, + { + "epoch": 1.0986666666666667, + "grad_norm": 0.6984114050865173, + "learning_rate": 3.82428943448705e-05, + "loss": 1.1989, + "num_input_tokens_seen": 1629984, + "step": 618 + }, + { + "epoch": 1.104, + "grad_norm": 1.2278004884719849, + "learning_rate": 3.773463070605987e-05, + "loss": 1.0763, + "num_input_tokens_seen": 1636016, + "step": 621 + }, + { + "epoch": 1.1093333333333333, + "grad_norm": 0.9503414034843445, + "learning_rate": 3.7227712103210486e-05, + "loss": 1.1786, + "num_input_tokens_seen": 1642832, + "step": 624 + }, + { + "epoch": 1.1146666666666667, + "grad_norm": 3.3439748287200928, + "learning_rate": 3.6722194125659556e-05, + "loss": 1.0718, + "num_input_tokens_seen": 1648752, + "step": 627 + }, + { + "epoch": 1.12, + "grad_norm": 0.8299528360366821, + "learning_rate": 3.6218132209150045e-05, + "loss": 1.3182, + "num_input_tokens_seen": 1657488, + "step": 630 + }, + { + "epoch": 1.1253333333333333, + "grad_norm": 0.4776313602924347, + "learning_rate": 3.5715581629751326e-05, + "loss": 1.3274, + "num_input_tokens_seen": 1666720, + "step": 633 + }, + { + "epoch": 1.1306666666666667, + "grad_norm": 0.610567033290863, + "learning_rate": 3.5214597497797684e-05, + "loss": 1.3109, + "num_input_tokens_seen": 1678048, + "step": 636 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.7393361926078796, + "learning_rate": 3.471523475184472e-05, + "loss": 1.287, + "num_input_tokens_seen": 1687120, + "step": 639 + }, + { + "epoch": 1.1413333333333333, + "grad_norm": 0.752165675163269, + "learning_rate": 3.4217548152644885e-05, + "loss": 1.2379, + "num_input_tokens_seen": 1695232, + "step": 642 + }, + { + "epoch": 1.1466666666666667, + "grad_norm": 0.7106702327728271, + "learning_rate": 3.372159227714218e-05, + "loss": 1.2971, + "num_input_tokens_seen": 1703424, + "step": 645 + }, + { + "epoch": 1.152, + "grad_norm": 1.0731079578399658, + "learning_rate": 3.322742151248725e-05, + "loss": 1.1837, + "num_input_tokens_seen": 1711504, + "step": 648 + }, + { + "epoch": 1.1573333333333333, + "grad_norm": 0.7986055016517639, + "learning_rate": 3.273509005007327e-05, + "loss": 1.1607, + "num_input_tokens_seen": 1720016, + "step": 651 + }, + { + "epoch": 1.1626666666666667, + "grad_norm": 0.8783122897148132, + "learning_rate": 3.224465187959316e-05, + "loss": 1.1984, + "num_input_tokens_seen": 1725952, + "step": 654 + }, + { + "epoch": 1.168, + "grad_norm": 0.6338449716567993, + "learning_rate": 3.1756160783119016e-05, + "loss": 1.2528, + "num_input_tokens_seen": 1735728, + "step": 657 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 1.1053016185760498, + "learning_rate": 3.12696703292044e-05, + "loss": 1.2614, + "num_input_tokens_seen": 1742160, + "step": 660 + }, + { + "epoch": 1.1786666666666668, + "grad_norm": 0.9506245255470276, + "learning_rate": 3.078523386700982e-05, + "loss": 1.2315, + "num_input_tokens_seen": 1748592, + "step": 663 + }, + { + "epoch": 1.184, + "grad_norm": 0.909778892993927, + "learning_rate": 3.0302904520452447e-05, + "loss": 1.1794, + "num_input_tokens_seen": 1756800, + "step": 666 + }, + { + "epoch": 1.1893333333333334, + "grad_norm": 0.7909402847290039, + "learning_rate": 2.9822735182380496e-05, + "loss": 1.205, + "num_input_tokens_seen": 1764624, + "step": 669 + }, + { + "epoch": 1.1946666666666665, + "grad_norm": 0.7799058556556702, + "learning_rate": 2.934477850877292e-05, + "loss": 1.2932, + "num_input_tokens_seen": 1772688, + "step": 672 + }, + { + "epoch": 1.2, + "grad_norm": 0.912695050239563, + "learning_rate": 2.886908691296504e-05, + "loss": 1.2259, + "num_input_tokens_seen": 1780128, + "step": 675 + }, + { + "epoch": 1.2053333333333334, + "grad_norm": 0.8391047120094299, + "learning_rate": 2.8395712559900877e-05, + "loss": 1.2725, + "num_input_tokens_seen": 1788160, + "step": 678 + }, + { + "epoch": 1.2106666666666666, + "grad_norm": 0.9869920611381531, + "learning_rate": 2.7924707360412746e-05, + "loss": 1.0984, + "num_input_tokens_seen": 1794336, + "step": 681 + }, + { + "epoch": 1.216, + "grad_norm": 1.0749855041503906, + "learning_rate": 2.7456122965528475e-05, + "loss": 1.3735, + "num_input_tokens_seen": 1801280, + "step": 684 + }, + { + "epoch": 1.2213333333333334, + "grad_norm": 0.6031284332275391, + "learning_rate": 2.699001076080742e-05, + "loss": 1.2211, + "num_input_tokens_seen": 1811088, + "step": 687 + }, + { + "epoch": 1.2266666666666666, + "grad_norm": 0.86882483959198, + "learning_rate": 2.6526421860705473e-05, + "loss": 1.123, + "num_input_tokens_seen": 1816848, + "step": 690 + }, + { + "epoch": 1.232, + "grad_norm": 0.6549557447433472, + "learning_rate": 2.6065407102969664e-05, + "loss": 1.2838, + "num_input_tokens_seen": 1824992, + "step": 693 + }, + { + "epoch": 1.2373333333333334, + "grad_norm": 0.737458348274231, + "learning_rate": 2.560701704306336e-05, + "loss": 1.1937, + "num_input_tokens_seen": 1831712, + "step": 696 + }, + { + "epoch": 1.2426666666666666, + "grad_norm": 0.5440405607223511, + "learning_rate": 2.5151301948622237e-05, + "loss": 1.2453, + "num_input_tokens_seen": 1842272, + "step": 699 + }, + { + "epoch": 1.248, + "grad_norm": 0.9566605687141418, + "learning_rate": 2.469831179394182e-05, + "loss": 1.1175, + "num_input_tokens_seen": 1847776, + "step": 702 + }, + { + "epoch": 1.2533333333333334, + "grad_norm": 0.5786175727844238, + "learning_rate": 2.4248096254497288e-05, + "loss": 1.2671, + "num_input_tokens_seen": 1856992, + "step": 705 + }, + { + "epoch": 1.2586666666666666, + "grad_norm": 1.0269118547439575, + "learning_rate": 2.3800704701496053e-05, + "loss": 1.2001, + "num_input_tokens_seen": 1864448, + "step": 708 + }, + { + "epoch": 1.264, + "grad_norm": 0.783041775226593, + "learning_rate": 2.33561861964635e-05, + "loss": 1.2348, + "num_input_tokens_seen": 1873680, + "step": 711 + }, + { + "epoch": 1.2693333333333334, + "grad_norm": 0.8268475532531738, + "learning_rate": 2.2914589485863014e-05, + "loss": 1.1127, + "num_input_tokens_seen": 1881744, + "step": 714 + }, + { + "epoch": 1.2746666666666666, + "grad_norm": 0.948457658290863, + "learning_rate": 2.247596299575022e-05, + "loss": 1.1406, + "num_input_tokens_seen": 1889808, + "step": 717 + }, + { + "epoch": 1.28, + "grad_norm": 0.7794213891029358, + "learning_rate": 2.2040354826462668e-05, + "loss": 1.1346, + "num_input_tokens_seen": 1896304, + "step": 720 + }, + { + "epoch": 1.2853333333333334, + "grad_norm": 0.9084338545799255, + "learning_rate": 2.160781274734495e-05, + "loss": 1.2673, + "num_input_tokens_seen": 1906672, + "step": 723 + }, + { + "epoch": 1.2906666666666666, + "grad_norm": 0.8108258843421936, + "learning_rate": 2.117838419151034e-05, + "loss": 1.1793, + "num_input_tokens_seen": 1913616, + "step": 726 + }, + { + "epoch": 1.296, + "grad_norm": 0.8344593644142151, + "learning_rate": 2.0752116250639225e-05, + "loss": 1.1043, + "num_input_tokens_seen": 1920944, + "step": 729 + }, + { + "epoch": 1.3013333333333335, + "grad_norm": 0.8615306615829468, + "learning_rate": 2.0329055669814934e-05, + "loss": 1.2762, + "num_input_tokens_seen": 1929216, + "step": 732 + }, + { + "epoch": 1.3066666666666666, + "grad_norm": 1.18500554561615, + "learning_rate": 1.9909248842397584e-05, + "loss": 1.02, + "num_input_tokens_seen": 1935792, + "step": 735 + }, + { + "epoch": 1.312, + "grad_norm": 1.018406867980957, + "learning_rate": 1.9492741804936622e-05, + "loss": 1.0995, + "num_input_tokens_seen": 1942752, + "step": 738 + }, + { + "epoch": 1.3173333333333335, + "grad_norm": 0.7798596620559692, + "learning_rate": 1.9079580232122303e-05, + "loss": 1.2871, + "num_input_tokens_seen": 1950528, + "step": 741 + }, + { + "epoch": 1.3226666666666667, + "grad_norm": 0.9787166714668274, + "learning_rate": 1.866980943177699e-05, + "loss": 1.171, + "num_input_tokens_seen": 1957392, + "step": 744 + }, + { + "epoch": 1.328, + "grad_norm": 0.6474674344062805, + "learning_rate": 1.8263474339886628e-05, + "loss": 1.2077, + "num_input_tokens_seen": 1966480, + "step": 747 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.035290002822876, + "learning_rate": 1.7860619515673033e-05, + "loss": 1.0823, + "num_input_tokens_seen": 1973968, + "step": 750 + }, + { + "epoch": 1.3386666666666667, + "grad_norm": 1.129233717918396, + "learning_rate": 1.746128913670746e-05, + "loss": 1.1719, + "num_input_tokens_seen": 1980416, + "step": 753 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.7446464896202087, + "learning_rate": 1.7065526994065973e-05, + "loss": 1.1616, + "num_input_tokens_seen": 1987808, + "step": 756 + }, + { + "epoch": 1.3493333333333333, + "grad_norm": 0.6898766160011292, + "learning_rate": 1.667337648752738e-05, + "loss": 1.2321, + "num_input_tokens_seen": 1995728, + "step": 759 + }, + { + "epoch": 1.3546666666666667, + "grad_norm": 0.7107200622558594, + "learning_rate": 1.6284880620813848e-05, + "loss": 1.4247, + "num_input_tokens_seen": 2004016, + "step": 762 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.6560705304145813, + "learning_rate": 1.5900081996875083e-05, + "loss": 1.1731, + "num_input_tokens_seen": 2012384, + "step": 765 + }, + { + "epoch": 1.3653333333333333, + "grad_norm": 0.6298875212669373, + "learning_rate": 1.551902281321651e-05, + "loss": 1.0719, + "num_input_tokens_seen": 2020096, + "step": 768 + }, + { + "epoch": 1.3706666666666667, + "grad_norm": 1.1242854595184326, + "learning_rate": 1.5141744857271778e-05, + "loss": 1.119, + "num_input_tokens_seen": 2027056, + "step": 771 + }, + { + "epoch": 1.376, + "grad_norm": 1.0423098802566528, + "learning_rate": 1.4768289501820265e-05, + "loss": 1.2802, + "num_input_tokens_seen": 2034256, + "step": 774 + }, + { + "epoch": 1.3813333333333333, + "grad_norm": 0.9645527601242065, + "learning_rate": 1.439869770045018e-05, + "loss": 1.172, + "num_input_tokens_seen": 2040768, + "step": 777 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 0.6331435441970825, + "learning_rate": 1.4033009983067452e-05, + "loss": 1.2637, + "num_input_tokens_seen": 2047648, + "step": 780 + }, + { + "epoch": 1.392, + "grad_norm": 1.2376108169555664, + "learning_rate": 1.367126645145121e-05, + "loss": 1.1766, + "num_input_tokens_seen": 2054848, + "step": 783 + }, + { + "epoch": 1.3973333333333333, + "grad_norm": 0.7504984736442566, + "learning_rate": 1.3313506774856177e-05, + "loss": 1.3347, + "num_input_tokens_seen": 2063744, + "step": 786 + }, + { + "epoch": 1.4026666666666667, + "grad_norm": 0.9626278877258301, + "learning_rate": 1.29597701856625e-05, + "loss": 1.2374, + "num_input_tokens_seen": 2072336, + "step": 789 + }, + { + "epoch": 1.408, + "grad_norm": 0.8408162593841553, + "learning_rate": 1.2610095475073414e-05, + "loss": 1.303, + "num_input_tokens_seen": 2080992, + "step": 792 + }, + { + "epoch": 1.4133333333333333, + "grad_norm": 1.0727360248565674, + "learning_rate": 1.22645209888614e-05, + "loss": 1.0784, + "num_input_tokens_seen": 2087728, + "step": 795 + }, + { + "epoch": 1.4186666666666667, + "grad_norm": 0.8654668927192688, + "learning_rate": 1.1923084623163172e-05, + "loss": 1.2539, + "num_input_tokens_seen": 2095968, + "step": 798 + }, + { + "epoch": 1.424, + "grad_norm": 0.7734363675117493, + "learning_rate": 1.1585823820323843e-05, + "loss": 1.2818, + "num_input_tokens_seen": 2103088, + "step": 801 + }, + { + "epoch": 1.4293333333333333, + "grad_norm": 0.9622248411178589, + "learning_rate": 1.1252775564791024e-05, + "loss": 1.143, + "num_input_tokens_seen": 2111408, + "step": 804 + }, + { + "epoch": 1.4346666666666668, + "grad_norm": 0.634290337562561, + "learning_rate": 1.0923976379059058e-05, + "loss": 1.233, + "num_input_tokens_seen": 2119632, + "step": 807 + }, + { + "epoch": 1.44, + "grad_norm": 0.9862900376319885, + "learning_rate": 1.0599462319663905e-05, + "loss": 0.9924, + "num_input_tokens_seen": 2127440, + "step": 810 + }, + { + "epoch": 1.4453333333333334, + "grad_norm": 1.0004602670669556, + "learning_rate": 1.0279268973229089e-05, + "loss": 1.2331, + "num_input_tokens_seen": 2137088, + "step": 813 + }, + { + "epoch": 1.4506666666666668, + "grad_norm": 0.6738834381103516, + "learning_rate": 9.963431452563332e-06, + "loss": 1.196, + "num_input_tokens_seen": 2144992, + "step": 816 + }, + { + "epoch": 1.456, + "grad_norm": 0.9389998912811279, + "learning_rate": 9.651984392809914e-06, + "loss": 1.0854, + "num_input_tokens_seen": 2153936, + "step": 819 + }, + { + "epoch": 1.4613333333333334, + "grad_norm": 0.8484275937080383, + "learning_rate": 9.344961947648623e-06, + "loss": 1.1659, + "num_input_tokens_seen": 2161760, + "step": 822 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.7993901968002319, + "learning_rate": 9.042397785550405e-06, + "loss": 1.1839, + "num_input_tokens_seen": 2168736, + "step": 825 + }, + { + "epoch": 1.472, + "grad_norm": 0.9979962110519409, + "learning_rate": 8.744325086085248e-06, + "loss": 1.2836, + "num_input_tokens_seen": 2175712, + "step": 828 + }, + { + "epoch": 1.4773333333333334, + "grad_norm": 1.106884479522705, + "learning_rate": 8.450776536283594e-06, + "loss": 1.1372, + "num_input_tokens_seen": 2182960, + "step": 831 + }, + { + "epoch": 1.4826666666666668, + "grad_norm": 0.8891430497169495, + "learning_rate": 8.16178432705192e-06, + "loss": 1.2549, + "num_input_tokens_seen": 2191232, + "step": 834 + }, + { + "epoch": 1.488, + "grad_norm": 0.680219829082489, + "learning_rate": 7.877380149642626e-06, + "loss": 1.1984, + "num_input_tokens_seen": 2200064, + "step": 837 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 1.0326625108718872, + "learning_rate": 7.597595192178702e-06, + "loss": 1.4679, + "num_input_tokens_seen": 2209072, + "step": 840 + }, + { + "epoch": 1.4986666666666666, + "grad_norm": 0.7626471519470215, + "learning_rate": 7.322460136233622e-06, + "loss": 1.3558, + "num_input_tokens_seen": 2216608, + "step": 843 + }, + { + "epoch": 1.504, + "grad_norm": 0.655614972114563, + "learning_rate": 7.052005153466779e-06, + "loss": 1.1688, + "num_input_tokens_seen": 2224784, + "step": 846 + }, + { + "epoch": 1.5093333333333332, + "grad_norm": 0.9603847861289978, + "learning_rate": 6.786259902314768e-06, + "loss": 1.0322, + "num_input_tokens_seen": 2232640, + "step": 849 + }, + { + "epoch": 1.5146666666666668, + "grad_norm": 0.846725583076477, + "learning_rate": 6.52525352473905e-06, + "loss": 1.1789, + "num_input_tokens_seen": 2241184, + "step": 852 + }, + { + "epoch": 1.52, + "grad_norm": 0.9454957246780396, + "learning_rate": 6.269014643030213e-06, + "loss": 1.2104, + "num_input_tokens_seen": 2248848, + "step": 855 + }, + { + "epoch": 1.5253333333333332, + "grad_norm": 0.6011205911636353, + "learning_rate": 6.017571356669183e-06, + "loss": 1.2657, + "num_input_tokens_seen": 2257808, + "step": 858 + }, + { + "epoch": 1.5306666666666666, + "grad_norm": 1.0003712177276611, + "learning_rate": 5.770951239245803e-06, + "loss": 1.1423, + "num_input_tokens_seen": 2264848, + "step": 861 + }, + { + "epoch": 1.536, + "grad_norm": 0.9697505831718445, + "learning_rate": 5.529181335435124e-06, + "loss": 1.0841, + "num_input_tokens_seen": 2271568, + "step": 864 + }, + { + "epoch": 1.5413333333333332, + "grad_norm": 0.7945210337638855, + "learning_rate": 5.292288158031594e-06, + "loss": 1.2021, + "num_input_tokens_seen": 2278592, + "step": 867 + }, + { + "epoch": 1.5466666666666666, + "grad_norm": 0.8583483099937439, + "learning_rate": 5.060297685041659e-06, + "loss": 1.0631, + "num_input_tokens_seen": 2286304, + "step": 870 + }, + { + "epoch": 1.552, + "grad_norm": 1.353039026260376, + "learning_rate": 4.833235356834959e-06, + "loss": 1.1668, + "num_input_tokens_seen": 2295040, + "step": 873 + }, + { + "epoch": 1.5573333333333332, + "grad_norm": 0.8499441146850586, + "learning_rate": 4.611126073354571e-06, + "loss": 1.2205, + "num_input_tokens_seen": 2304000, + "step": 876 + }, + { + "epoch": 1.5626666666666666, + "grad_norm": 1.1828577518463135, + "learning_rate": 4.3939941913863525e-06, + "loss": 1.1882, + "num_input_tokens_seen": 2310384, + "step": 879 + }, + { + "epoch": 1.568, + "grad_norm": 0.8698590993881226, + "learning_rate": 4.181863521888019e-06, + "loss": 1.2363, + "num_input_tokens_seen": 2317008, + "step": 882 + }, + { + "epoch": 1.5733333333333333, + "grad_norm": 1.1912041902542114, + "learning_rate": 3.974757327377981e-06, + "loss": 1.1589, + "num_input_tokens_seen": 2324752, + "step": 885 + }, + { + "epoch": 1.5786666666666667, + "grad_norm": 1.479888677597046, + "learning_rate": 3.772698319384349e-06, + "loss": 1.0814, + "num_input_tokens_seen": 2332416, + "step": 888 + }, + { + "epoch": 1.584, + "grad_norm": 0.7699785232543945, + "learning_rate": 3.575708655954324e-06, + "loss": 1.2078, + "num_input_tokens_seen": 2341936, + "step": 891 + }, + { + "epoch": 1.5893333333333333, + "grad_norm": 0.7506076097488403, + "learning_rate": 3.3838099392243916e-06, + "loss": 1.1234, + "num_input_tokens_seen": 2351968, + "step": 894 + }, + { + "epoch": 1.5946666666666667, + "grad_norm": 0.8103495836257935, + "learning_rate": 3.197023213051337e-06, + "loss": 1.2396, + "num_input_tokens_seen": 2361200, + "step": 897 + }, + { + "epoch": 1.6, + "grad_norm": 0.8596040606498718, + "learning_rate": 3.0153689607045845e-06, + "loss": 1.2116, + "num_input_tokens_seen": 2367984, + "step": 900 + }, + { + "epoch": 1.6053333333333333, + "grad_norm": 1.0933467149734497, + "learning_rate": 2.8388671026199522e-06, + "loss": 1.1365, + "num_input_tokens_seen": 2375360, + "step": 903 + }, + { + "epoch": 1.6106666666666667, + "grad_norm": 0.8759250044822693, + "learning_rate": 2.667536994215186e-06, + "loss": 1.2196, + "num_input_tokens_seen": 2383344, + "step": 906 + }, + { + "epoch": 1.616, + "grad_norm": 0.8788615465164185, + "learning_rate": 2.501397423767382e-06, + "loss": 1.2588, + "num_input_tokens_seen": 2390464, + "step": 909 + }, + { + "epoch": 1.6213333333333333, + "grad_norm": 0.8420098423957825, + "learning_rate": 2.340466610352654e-06, + "loss": 1.195, + "num_input_tokens_seen": 2398736, + "step": 912 + }, + { + "epoch": 1.6266666666666667, + "grad_norm": 0.8773792386054993, + "learning_rate": 2.1847622018482283e-06, + "loss": 1.2392, + "num_input_tokens_seen": 2406864, + "step": 915 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 1.1017309427261353, + "learning_rate": 2.0343012729971243e-06, + "loss": 1.104, + "num_input_tokens_seen": 2414304, + "step": 918 + }, + { + "epoch": 1.6373333333333333, + "grad_norm": 1.108170986175537, + "learning_rate": 1.8891003235357308e-06, + "loss": 1.1601, + "num_input_tokens_seen": 2421648, + "step": 921 + }, + { + "epoch": 1.6426666666666667, + "grad_norm": 0.9205071330070496, + "learning_rate": 1.7491752763844293e-06, + "loss": 1.2597, + "num_input_tokens_seen": 2429856, + "step": 924 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 0.7994181513786316, + "learning_rate": 1.6145414759014431e-06, + "loss": 1.0914, + "num_input_tokens_seen": 2437840, + "step": 927 + }, + { + "epoch": 1.6533333333333333, + "grad_norm": 0.8237050771713257, + "learning_rate": 1.4852136862001764e-06, + "loss": 1.1627, + "num_input_tokens_seen": 2446560, + "step": 930 + }, + { + "epoch": 1.6586666666666665, + "grad_norm": 1.0996540784835815, + "learning_rate": 1.3612060895301759e-06, + "loss": 1.1386, + "num_input_tokens_seen": 2453360, + "step": 933 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.7453559637069702, + "learning_rate": 1.2425322847218368e-06, + "loss": 1.29, + "num_input_tokens_seen": 2461600, + "step": 936 + }, + { + "epoch": 1.6693333333333333, + "grad_norm": 1.20688796043396, + "learning_rate": 1.1292052856952062e-06, + "loss": 1.0164, + "num_input_tokens_seen": 2469168, + "step": 939 + }, + { + "epoch": 1.6746666666666665, + "grad_norm": 0.9430545568466187, + "learning_rate": 1.0212375200327973e-06, + "loss": 1.0143, + "num_input_tokens_seen": 2476688, + "step": 942 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.8898412585258484, + "learning_rate": 9.186408276168013e-07, + "loss": 1.1762, + "num_input_tokens_seen": 2485136, + "step": 945 + }, + { + "epoch": 1.6853333333333333, + "grad_norm": 1.0093709230422974, + "learning_rate": 8.214264593307098e-07, + "loss": 1.2691, + "num_input_tokens_seen": 2491568, + "step": 948 + }, + { + "epoch": 1.6906666666666665, + "grad_norm": 1.4375550746917725, + "learning_rate": 7.296050758254957e-07, + "loss": 1.3668, + "num_input_tokens_seen": 2499312, + "step": 951 + }, + { + "epoch": 1.696, + "grad_norm": 0.8531755208969116, + "learning_rate": 6.431867463506048e-07, + "loss": 1.3101, + "num_input_tokens_seen": 2506160, + "step": 954 + }, + { + "epoch": 1.7013333333333334, + "grad_norm": 0.8654898405075073, + "learning_rate": 5.621809476497098e-07, + "loss": 1.3651, + "num_input_tokens_seen": 2514000, + "step": 957 + }, + { + "epoch": 1.7066666666666666, + "grad_norm": 0.8185416460037231, + "learning_rate": 4.865965629214819e-07, + "loss": 1.0573, + "num_input_tokens_seen": 2520416, + "step": 960 + }, + { + "epoch": 1.712, + "grad_norm": 0.7298229932785034, + "learning_rate": 4.1644188084548063e-07, + "loss": 1.3964, + "num_input_tokens_seen": 2528112, + "step": 963 + }, + { + "epoch": 1.7173333333333334, + "grad_norm": 0.8970651030540466, + "learning_rate": 3.517245946731529e-07, + "loss": 1.1651, + "num_input_tokens_seen": 2535360, + "step": 966 + }, + { + "epoch": 1.7226666666666666, + "grad_norm": 0.6972456574440002, + "learning_rate": 2.924518013842303e-07, + "loss": 1.1533, + "num_input_tokens_seen": 2543792, + "step": 969 + }, + { + "epoch": 1.728, + "grad_norm": 0.727842390537262, + "learning_rate": 2.386300009084408e-07, + "loss": 1.2354, + "num_input_tokens_seen": 2553136, + "step": 972 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.913786768913269, + "learning_rate": 1.9026509541272275e-07, + "loss": 1.1855, + "num_input_tokens_seen": 2561840, + "step": 975 + }, + { + "epoch": 1.7386666666666666, + "grad_norm": 0.9313849210739136, + "learning_rate": 1.4736238865398765e-07, + "loss": 0.9797, + "num_input_tokens_seen": 2570032, + "step": 978 + }, + { + "epoch": 1.744, + "grad_norm": 0.8113420009613037, + "learning_rate": 1.0992658539750178e-07, + "loss": 1.1833, + "num_input_tokens_seen": 2577888, + "step": 981 + }, + { + "epoch": 1.7493333333333334, + "grad_norm": 0.7111260294914246, + "learning_rate": 7.796179090094891e-08, + "loss": 1.3613, + "num_input_tokens_seen": 2586448, + "step": 984 + }, + { + "epoch": 1.7546666666666666, + "grad_norm": 0.909915566444397, + "learning_rate": 5.1471510464268236e-08, + "loss": 1.1256, + "num_input_tokens_seen": 2593744, + "step": 987 + }, + { + "epoch": 1.76, + "grad_norm": 0.8057591915130615, + "learning_rate": 3.04586490452119e-08, + "loss": 1.1567, + "num_input_tokens_seen": 2602048, + "step": 990 + }, + { + "epoch": 1.7653333333333334, + "grad_norm": 0.9321501851081848, + "learning_rate": 1.4925510940844156e-08, + "loss": 1.302, + "num_input_tokens_seen": 2610160, + "step": 993 + }, + { + "epoch": 1.7706666666666666, + "grad_norm": 1.0042476654052734, + "learning_rate": 4.873799534788059e-09, + "loss": 1.21, + "num_input_tokens_seen": 2617616, + "step": 996 + }, + { + "epoch": 1.776, + "grad_norm": 1.1939113140106201, + "learning_rate": 3.0461711048035415e-10, + "loss": 1.0737, + "num_input_tokens_seen": 2624416, + "step": 999 + }, + { + "epoch": 1.7777777777777777, + "eval_loss": 1.258375644683838, + "eval_runtime": 22.1004, + "eval_samples_per_second": 45.248, + "eval_steps_per_second": 22.624, + "num_input_tokens_seen": 2626832, + "step": 1000 + }, + { + "epoch": 1.7777777777777777, + "num_input_tokens_seen": 2626832, + "step": 1000, + "total_flos": 1.0648609969392845e+17, + "train_loss": 1.2841649515628815, + "train_runtime": 1377.1674, + "train_samples_per_second": 11.618, + "train_steps_per_second": 0.726 + } + ], + "logging_steps": 3, + "max_steps": 1000, + "num_input_tokens_seen": 2626832, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0648609969392845e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/training_args.bin b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/training_args.bin new file mode 100644 index 00000000..2b317f6f Binary files /dev/null and b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/training_args.bin differ diff --git a/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/training_eval_loss.png b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/training_eval_loss.png new file mode 100644 index 00000000..840caf96 Binary files /dev/null and b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/training_eval_loss.png differ diff --git a/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/training_loss.png b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/training_loss.png new file mode 100644 index 00000000..4f84cfa4 Binary files /dev/null and b/results/lora_sft/Baichuan2-7B/Baichuan2_lora_sft_1_single/training_loss.png differ