diff --git a/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1.txt b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1.txt new file mode 100644 index 00000000..094bd602 --- /dev/null +++ b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1.txt @@ -0,0 +1,447 @@ +08/20/2024 08:33:03 - INFO - llamafactory.cli - Initializing distributed tasks at: 127.0.0.1:24071 +08/20/2024 08:33:11 - WARNING - llamafactory.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +08/20/2024 08:33:11 - INFO - llamafactory.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +08/20/2024 08:33:11 - INFO - llamafactory.data.template - Cannot add this chat template to tokenizer. +08/20/2024 08:33:11 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_1M_CN... +08/20/2024 08:33:12 - WARNING - llamafactory.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +08/20/2024 08:33:12 - INFO - llamafactory.hparams.parser - Process rank: 1, device: cuda:1, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +08/20/2024 08:33:12 - INFO - llamafactory.data.template - Cannot add this chat template to tokenizer. +08/20/2024 08:33:12 - WARNING - llamafactory.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +08/20/2024 08:33:12 - INFO - llamafactory.hparams.parser - Process rank: 6, device: cuda:6, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +08/20/2024 08:33:12 - INFO - llamafactory.data.template - Cannot add this chat template to tokenizer. +08/20/2024 08:33:13 - WARNING - llamafactory.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +08/20/2024 08:33:13 - INFO - llamafactory.hparams.parser - Process rank: 3, device: cuda:3, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +08/20/2024 08:33:13 - WARNING - llamafactory.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +08/20/2024 08:33:13 - INFO - llamafactory.hparams.parser - Process rank: 4, device: cuda:4, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +08/20/2024 08:33:13 - INFO - llamafactory.data.template - Cannot add this chat template to tokenizer. +08/20/2024 08:33:13 - INFO - llamafactory.data.template - Cannot add this chat template to tokenizer. +08/20/2024 08:33:13 - WARNING - llamafactory.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +08/20/2024 08:33:13 - INFO - llamafactory.hparams.parser - Process rank: 2, device: cuda:2, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +08/20/2024 08:33:13 - INFO - llamafactory.data.template - Cannot add this chat template to tokenizer. +08/20/2024 08:33:13 - WARNING - llamafactory.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. +08/20/2024 08:33:13 - INFO - llamafactory.hparams.parser - Process rank: 5, device: cuda:5, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 +08/20/2024 08:33:13 - INFO - llamafactory.data.template - Cannot add this chat template to tokenizer. +08/20/2024 08:34:17 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_1M_CN... +08/20/2024 08:34:17 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_1M_CN... +08/20/2024 08:34:17 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_1M_CN... +08/20/2024 08:34:17 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_1M_CN... +08/20/2024 08:34:17 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_1M_CN... +08/20/2024 08:34:17 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_1M_CN... +training example: +input_ids: +[64790, 64792, 790, 30951, 517, 30910, 30940, 30996, 13, 13, 54761, 31211, 33448, 54840, 32081, 32146, 32034, 32363, 46177, 33257, 31155, 31654, 37725, 31123, 55073, 31692, 33608, 32108, 31155, 13, 33182, 31690, 35868, 38334, 54868, 30954, 449, 31796, 35078, 31646, 32057, 31123, 32827, 54687, 31692, 34559, 31786, 54542, 48763, 31123, 54548, 32510, 31773, 33476, 37863, 13, 13, 13, 55437, 31211, 50669, 44667, 32363, 46177, 33257, 31155, 2] +inputs: +[Round 0] + +问:判断给定的文章是否符合语法规则。如果不符合,请提供修改建议。 +下面是一篇文章的开头: "为了探讨这个主题,本文将提供一系列数据和实例,以证明这一观点。" + + +答: 这个开头符合语法规则。 +label_ids: +[2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 50669, 44667, 32363, 46177, 33257, 31155, 2] +labels: +这个开头符合语法规则。 +08/20/2024 08:35:52 - WARNING - llamafactory.model.model_utils.checkpointing - You are using the old GC format, some features (e.g. BAdam) will be invalid. +08/20/2024 08:35:52 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled. +08/20/2024 08:35:52 - INFO - llamafactory.model.model_utils.attention - Using vanilla attention implementation. +08/20/2024 08:35:52 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32. +08/20/2024 08:35:52 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA +08/20/2024 08:35:52 - INFO - llamafactory.model.model_utils.misc - Found linear modules: dense_h_to_4h,query_key_value,dense,dense_4h_to_h +08/20/2024 08:35:52 - INFO - llamafactory.model.loader - trainable params: 14,823,424 || all params: 6,258,407,424 || trainable%: 0.2369 +08/20/2024 08:35:52 - WARNING - llamafactory.model.model_utils.checkpointing - You are using the old GC format, some features (e.g. BAdam) will be invalid. +08/20/2024 08:35:52 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled. +08/20/2024 08:35:52 - INFO - llamafactory.model.model_utils.attention - Using vanilla attention implementation. +08/20/2024 08:35:52 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32. +08/20/2024 08:35:52 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA +08/20/2024 08:35:52 - INFO - llamafactory.model.model_utils.misc - Found linear modules: query_key_value,dense,dense_h_to_4h,dense_4h_to_h +08/20/2024 08:35:53 - INFO - llamafactory.model.loader - trainable params: 14,823,424 || all params: 6,258,407,424 || trainable%: 0.2369 +08/20/2024 08:35:53 - WARNING - llamafactory.model.model_utils.checkpointing - You are using the old GC format, some features (e.g. BAdam) will be invalid. +08/20/2024 08:35:53 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled. +08/20/2024 08:35:53 - INFO - llamafactory.model.model_utils.attention - Using vanilla attention implementation. +08/20/2024 08:35:53 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32. +08/20/2024 08:35:53 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA +08/20/2024 08:35:53 - WARNING - llamafactory.model.model_utils.checkpointing - You are using the old GC format, some features (e.g. BAdam) will be invalid. +08/20/2024 08:35:53 - INFO - llamafactory.model.model_utils.misc - Found linear modules: query_key_value,dense_4h_to_h,dense,dense_h_to_4h +08/20/2024 08:35:53 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled. +08/20/2024 08:35:53 - INFO - llamafactory.model.model_utils.attention - Using vanilla attention implementation. +08/20/2024 08:35:53 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32. +08/20/2024 08:35:53 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA +08/20/2024 08:35:53 - INFO - llamafactory.model.model_utils.misc - Found linear modules: dense_4h_to_h,dense,query_key_value,dense_h_to_4h +08/20/2024 08:35:53 - INFO - llamafactory.model.loader - trainable params: 14,823,424 || all params: 6,258,407,424 || trainable%: 0.2369 +08/20/2024 08:35:53 - INFO - llamafactory.model.loader - trainable params: 14,823,424 || all params: 6,258,407,424 || trainable%: 0.2369 +08/20/2024 08:35:53 - WARNING - llamafactory.model.model_utils.checkpointing - You are using the old GC format, some features (e.g. BAdam) will be invalid. +08/20/2024 08:35:53 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled. +08/20/2024 08:35:53 - INFO - llamafactory.model.model_utils.attention - Using vanilla attention implementation. +08/20/2024 08:35:53 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32. +08/20/2024 08:35:53 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA +08/20/2024 08:35:53 - INFO - llamafactory.model.model_utils.misc - Found linear modules: dense_4h_to_h,query_key_value,dense_h_to_4h,dense +08/20/2024 08:35:53 - INFO - llamafactory.model.loader - trainable params: 14,823,424 || all params: 6,258,407,424 || trainable%: 0.2369 +08/20/2024 08:35:53 - WARNING - llamafactory.model.model_utils.checkpointing - You are using the old GC format, some features (e.g. BAdam) will be invalid. +08/20/2024 08:35:53 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled. +08/20/2024 08:35:53 - INFO - llamafactory.model.model_utils.attention - Using vanilla attention implementation. +08/20/2024 08:35:53 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32. +08/20/2024 08:35:53 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA +08/20/2024 08:35:53 - INFO - llamafactory.model.model_utils.misc - Found linear modules: dense,dense_h_to_4h,dense_4h_to_h,query_key_value +08/20/2024 08:35:53 - WARNING - llamafactory.model.model_utils.checkpointing - You are using the old GC format, some features (e.g. BAdam) will be invalid. +08/20/2024 08:35:53 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled. +08/20/2024 08:35:53 - INFO - llamafactory.model.model_utils.attention - Using vanilla attention implementation. +08/20/2024 08:35:53 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32. +08/20/2024 08:35:53 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA +08/20/2024 08:35:53 - INFO - llamafactory.model.model_utils.misc - Found linear modules: query_key_value,dense_h_to_4h,dense_4h_to_h,dense +08/20/2024 08:35:54 - INFO - llamafactory.model.loader - trainable params: 14,823,424 || all params: 6,258,407,424 || trainable%: 0.2369 +08/20/2024 08:35:54 - INFO - llamafactory.model.loader - trainable params: 14,823,424 || all params: 6,258,407,424 || trainable%: 0.2369 +{'loss': 2.6491, 'grad_norm': 0.45354562997817993, 'learning_rate': 3e-06, 'epoch': 0.04, 'num_input_tokens_seen': 59360} +{'loss': 2.714, 'grad_norm': 0.492897093296051, 'learning_rate': 6e-06, 'epoch': 0.07, 'num_input_tokens_seen': 110224} +{'loss': 2.8832, 'grad_norm': 0.4439088702201843, 'learning_rate': 9e-06, 'epoch': 0.11, 'num_input_tokens_seen': 162416} +{'loss': 2.8293, 'grad_norm': 0.4829657971858978, 'learning_rate': 1.2e-05, 'epoch': 0.15, 'num_input_tokens_seen': 215648} +{'loss': 2.7404, 'grad_norm': 0.5115702748298645, 'learning_rate': 1.5e-05, 'epoch': 0.19, 'num_input_tokens_seen': 270528} +{'loss': 2.7317, 'grad_norm': 0.5357673168182373, 'learning_rate': 1.8e-05, 'epoch': 0.22, 'num_input_tokens_seen': 320944} +{'loss': 2.7431, 'grad_norm': 0.4877207577228546, 'learning_rate': 2.1e-05, 'epoch': 0.26, 'num_input_tokens_seen': 377696} +{'loss': 2.5594, 'grad_norm': 0.5688062310218811, 'learning_rate': 2.4e-05, 'epoch': 0.3, 'num_input_tokens_seen': 428048} +{'loss': 2.7774, 'grad_norm': 0.5852718949317932, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.34, 'num_input_tokens_seen': 485680} +{'loss': 2.7325, 'grad_norm': 0.626228928565979, 'learning_rate': 3e-05, 'epoch': 0.37, 'num_input_tokens_seen': 538704} +{'loss': 2.6304, 'grad_norm': 0.7254530787467957, 'learning_rate': 3.3e-05, 'epoch': 0.41, 'num_input_tokens_seen': 590112} +{'loss': 2.6643, 'grad_norm': 0.7166836261749268, 'learning_rate': 3.6e-05, 'epoch': 0.45, 'num_input_tokens_seen': 645360} +{'loss': 2.5223, 'grad_norm': 0.8263206481933594, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.49, 'num_input_tokens_seen': 699664} +{'loss': 2.3975, 'grad_norm': 1.0774929523468018, 'learning_rate': 4.2e-05, 'epoch': 0.52, 'num_input_tokens_seen': 752640} +{'loss': 2.3851, 'grad_norm': 1.4689879417419434, 'learning_rate': 4.5e-05, 'epoch': 0.56, 'num_input_tokens_seen': 809488} +{'loss': 2.1509, 'grad_norm': 1.6804075241088867, 'learning_rate': 4.8e-05, 'epoch': 0.6, 'num_input_tokens_seen': 866016} +{'loss': 1.9498, 'grad_norm': 1.5407774448394775, 'learning_rate': 5.1000000000000006e-05, 'epoch': 0.63, 'num_input_tokens_seen': 922160} +{'loss': 1.863, 'grad_norm': 0.9005318880081177, 'learning_rate': 5.4000000000000005e-05, 'epoch': 0.67, 'num_input_tokens_seen': 976400} +{'loss': 1.6358, 'grad_norm': 0.4560866355895996, 'learning_rate': 5.6999999999999996e-05, 'epoch': 0.71, 'num_input_tokens_seen': 1028864} +{'loss': 1.6125, 'grad_norm': 0.4046150743961334, 'learning_rate': 6e-05, 'epoch': 0.75, 'num_input_tokens_seen': 1082896} +{'loss': 1.7412, 'grad_norm': 0.40974393486976624, 'learning_rate': 6.3e-05, 'epoch': 0.78, 'num_input_tokens_seen': 1141856} +{'loss': 1.6413, 'grad_norm': 0.38810229301452637, 'learning_rate': 6.6e-05, 'epoch': 0.82, 'num_input_tokens_seen': 1196976} +{'loss': 1.6965, 'grad_norm': 0.3670073449611664, 'learning_rate': 6.9e-05, 'epoch': 0.86, 'num_input_tokens_seen': 1249696} +{'loss': 1.623, 'grad_norm': 0.3049280345439911, 'learning_rate': 7.2e-05, 'epoch': 0.9, 'num_input_tokens_seen': 1304880} +{'loss': 1.5551, 'grad_norm': 0.2850935161113739, 'learning_rate': 7.500000000000001e-05, 'epoch': 0.93, 'num_input_tokens_seen': 1359776} +{'loss': 1.5815, 'grad_norm': 0.262834370136261, 'learning_rate': 7.800000000000001e-05, 'epoch': 0.97, 'num_input_tokens_seen': 1415584} +{'loss': 1.5404, 'grad_norm': 0.24694491922855377, 'learning_rate': 8.1e-05, 'epoch': 1.01, 'num_input_tokens_seen': 1468720} +{'loss': 1.5862, 'grad_norm': 0.24158701300621033, 'learning_rate': 8.4e-05, 'epoch': 1.05, 'num_input_tokens_seen': 1525872} +{'loss': 1.6087, 'grad_norm': 0.2713283598423004, 'learning_rate': 8.7e-05, 'epoch': 1.08, 'num_input_tokens_seen': 1577456} +{'loss': 1.5819, 'grad_norm': 0.23512157797813416, 'learning_rate': 9e-05, 'epoch': 1.12, 'num_input_tokens_seen': 1631152} +{'loss': 1.6023, 'grad_norm': 0.24216856062412262, 'learning_rate': 9.300000000000001e-05, 'epoch': 1.16, 'num_input_tokens_seen': 1692128} +{'loss': 1.5937, 'grad_norm': 0.22018860280513763, 'learning_rate': 9.6e-05, 'epoch': 1.19, 'num_input_tokens_seen': 1744368} +{'loss': 1.588, 'grad_norm': 0.26621294021606445, 'learning_rate': 9.900000000000001e-05, 'epoch': 1.23, 'num_input_tokens_seen': 1795904} +{'loss': 1.5538, 'grad_norm': 0.18780255317687988, 'learning_rate': 9.999878153526974e-05, 'epoch': 1.27, 'num_input_tokens_seen': 1851744} +{'loss': 1.5003, 'grad_norm': 0.25466451048851013, 'learning_rate': 9.999238475781957e-05, 'epoch': 1.31, 'num_input_tokens_seen': 1904912} +{'loss': 1.5044, 'grad_norm': 0.2715682089328766, 'learning_rate': 9.998050575201771e-05, 'epoch': 1.34, 'num_input_tokens_seen': 1956592} +{'loss': 1.5709, 'grad_norm': 0.3099578320980072, 'learning_rate': 9.996314582053106e-05, 'epoch': 1.38, 'num_input_tokens_seen': 2008672} +{'loss': 1.5742, 'grad_norm': 0.23790551722049713, 'learning_rate': 9.99403068670717e-05, 'epoch': 1.42, 'num_input_tokens_seen': 2063056} +{'loss': 1.4958, 'grad_norm': 0.23921732604503632, 'learning_rate': 9.991199139618827e-05, 'epoch': 1.46, 'num_input_tokens_seen': 2120000} +{'loss': 1.5715, 'grad_norm': 0.23010362684726715, 'learning_rate': 9.987820251299122e-05, 'epoch': 1.49, 'num_input_tokens_seen': 2173760} +{'loss': 1.5582, 'grad_norm': 0.21609963476657867, 'learning_rate': 9.983894392281237e-05, 'epoch': 1.53, 'num_input_tokens_seen': 2228944} +{'loss': 1.4904, 'grad_norm': 0.28609582781791687, 'learning_rate': 9.979421993079852e-05, 'epoch': 1.57, 'num_input_tokens_seen': 2280544} +{'loss': 1.4941, 'grad_norm': 0.2686341404914856, 'learning_rate': 9.974403544143941e-05, 'epoch': 1.6, 'num_input_tokens_seen': 2337536} +{'loss': 1.5594, 'grad_norm': 0.3083486557006836, 'learning_rate': 9.968839595802982e-05, 'epoch': 1.64, 'num_input_tokens_seen': 2391008} +{'loss': 1.45, 'grad_norm': 0.2757379114627838, 'learning_rate': 9.962730758206611e-05, 'epoch': 1.68, 'num_input_tokens_seen': 2445312} +{'loss': 1.6545, 'grad_norm': 0.3191740810871124, 'learning_rate': 9.956077701257709e-05, 'epoch': 1.72, 'num_input_tokens_seen': 2495616} +{'loss': 1.5083, 'grad_norm': 0.2461022436618805, 'learning_rate': 9.948881154538945e-05, 'epoch': 1.75, 'num_input_tokens_seen': 2553136} +{'loss': 1.511, 'grad_norm': 0.2785870134830475, 'learning_rate': 9.941141907232765e-05, 'epoch': 1.79, 'num_input_tokens_seen': 2607888} +{'loss': 1.4864, 'grad_norm': 0.2595404088497162, 'learning_rate': 9.932860808034848e-05, 'epoch': 1.83, 'num_input_tokens_seen': 2662928} +{'loss': 1.4787, 'grad_norm': 0.3087688982486725, 'learning_rate': 9.924038765061042e-05, 'epoch': 1.87, 'num_input_tokens_seen': 2719936} +{'loss': 1.4957, 'grad_norm': 0.2820245325565338, 'learning_rate': 9.914676745747772e-05, 'epoch': 1.9, 'num_input_tokens_seen': 2774928} +{'loss': 1.5388, 'grad_norm': 0.29079610109329224, 'learning_rate': 9.904775776745958e-05, 'epoch': 1.94, 'num_input_tokens_seen': 2831584} +{'loss': 1.4156, 'grad_norm': 0.32320481538772583, 'learning_rate': 9.894336943808426e-05, 'epoch': 1.98, 'num_input_tokens_seen': 2886528} +{'loss': 1.4933, 'grad_norm': 0.31004923582077026, 'learning_rate': 9.88336139167084e-05, 'epoch': 2.02, 'num_input_tokens_seen': 2944048} +{'loss': 1.4981, 'grad_norm': 0.2879312336444855, 'learning_rate': 9.871850323926177e-05, 'epoch': 2.05, 'num_input_tokens_seen': 2996624} +{'loss': 1.5245, 'grad_norm': 0.32446593046188354, 'learning_rate': 9.859805002892732e-05, 'epoch': 2.09, 'num_input_tokens_seen': 3050672} +{'loss': 1.4546, 'grad_norm': 0.38426050543785095, 'learning_rate': 9.847226749475695e-05, 'epoch': 2.13, 'num_input_tokens_seen': 3101648} +{'loss': 1.4426, 'grad_norm': 0.31284961104393005, 'learning_rate': 9.834116943022298e-05, 'epoch': 2.16, 'num_input_tokens_seen': 3158912} +{'loss': 1.4767, 'grad_norm': 0.3530096709728241, 'learning_rate': 9.820477021170551e-05, 'epoch': 2.2, 'num_input_tokens_seen': 3215280} +{'loss': 1.4652, 'grad_norm': 0.34290698170661926, 'learning_rate': 9.806308479691595e-05, 'epoch': 2.24, 'num_input_tokens_seen': 3270144} +{'loss': 1.451, 'grad_norm': 0.30220866203308105, 'learning_rate': 9.791612872325667e-05, 'epoch': 2.28, 'num_input_tokens_seen': 3326128} +{'loss': 1.4005, 'grad_norm': 0.3330557644367218, 'learning_rate': 9.776391810611718e-05, 'epoch': 2.31, 'num_input_tokens_seen': 3382096} +{'loss': 1.5281, 'grad_norm': 0.35981714725494385, 'learning_rate': 9.760646963710694e-05, 'epoch': 2.35, 'num_input_tokens_seen': 3432624} +{'loss': 1.4811, 'grad_norm': 0.41824910044670105, 'learning_rate': 9.744380058222483e-05, 'epoch': 2.39, 'num_input_tokens_seen': 3485216} +{'loss': 1.4439, 'grad_norm': 0.3749221861362457, 'learning_rate': 9.727592877996585e-05, 'epoch': 2.43, 'num_input_tokens_seen': 3536224} +{'loss': 1.4742, 'grad_norm': 0.41196370124816895, 'learning_rate': 9.710287263936484e-05, 'epoch': 2.46, 'num_input_tokens_seen': 3588128} +{'loss': 1.4419, 'grad_norm': 0.4999885559082031, 'learning_rate': 9.69246511379778e-05, 'epoch': 2.5, 'num_input_tokens_seen': 3644256} +{'loss': 1.4283, 'grad_norm': 0.4147838354110718, 'learning_rate': 9.674128381980072e-05, 'epoch': 2.54, 'num_input_tokens_seen': 3697680} +{'loss': 1.4107, 'grad_norm': 0.3952224850654602, 'learning_rate': 9.655279079312642e-05, 'epoch': 2.58, 'num_input_tokens_seen': 3756608} +{'loss': 1.4162, 'grad_norm': 0.3909386694431305, 'learning_rate': 9.635919272833938e-05, 'epoch': 2.61, 'num_input_tokens_seen': 3811632} +{'loss': 1.5205, 'grad_norm': 0.4344032406806946, 'learning_rate': 9.616051085564906e-05, 'epoch': 2.65, 'num_input_tokens_seen': 3870112} +{'loss': 1.5375, 'grad_norm': 0.49284985661506653, 'learning_rate': 9.595676696276172e-05, 'epoch': 2.69, 'num_input_tokens_seen': 3926352} +{'loss': 1.4106, 'grad_norm': 0.42042234539985657, 'learning_rate': 9.574798339249125e-05, 'epoch': 2.72, 'num_input_tokens_seen': 3980768} +{'loss': 1.4377, 'grad_norm': 0.5457227826118469, 'learning_rate': 9.553418304030886e-05, 'epoch': 2.76, 'num_input_tokens_seen': 4029792} +{'loss': 1.4638, 'grad_norm': 0.4127957820892334, 'learning_rate': 9.53153893518325e-05, 'epoch': 2.8, 'num_input_tokens_seen': 4087248} +{'loss': 1.469, 'grad_norm': 0.4777499735355377, 'learning_rate': 9.50916263202557e-05, 'epoch': 2.84, 'num_input_tokens_seen': 4142912} +{'loss': 1.4436, 'grad_norm': 0.4768824875354767, 'learning_rate': 9.486291848371643e-05, 'epoch': 2.87, 'num_input_tokens_seen': 4198272} +{'loss': 1.4491, 'grad_norm': 0.4727541506290436, 'learning_rate': 9.462929092260628e-05, 'epoch': 2.91, 'num_input_tokens_seen': 4252528} +{'loss': 1.4245, 'grad_norm': 0.46310773491859436, 'learning_rate': 9.439076925682006e-05, 'epoch': 2.95, 'num_input_tokens_seen': 4305472} +{'loss': 1.5656, 'grad_norm': 0.554982602596283, 'learning_rate': 9.414737964294636e-05, 'epoch': 2.99, 'num_input_tokens_seen': 4360608} +{'loss': 1.362, 'grad_norm': 0.4734109342098236, 'learning_rate': 9.389914877139903e-05, 'epoch': 3.02, 'num_input_tokens_seen': 4418496} +{'loss': 1.4078, 'grad_norm': 0.5144374966621399, 'learning_rate': 9.364610386349049e-05, 'epoch': 3.06, 'num_input_tokens_seen': 4471104} +{'loss': 1.3074, 'grad_norm': 0.4901665449142456, 'learning_rate': 9.338827266844644e-05, 'epoch': 3.1, 'num_input_tokens_seen': 4522432} +{'loss': 1.379, 'grad_norm': 0.4994266629219055, 'learning_rate': 9.312568346036288e-05, 'epoch': 3.14, 'num_input_tokens_seen': 4577216} +{'loss': 1.3311, 'grad_norm': 0.5738364458084106, 'learning_rate': 9.285836503510562e-05, 'epoch': 3.17, 'num_input_tokens_seen': 4628832} +{'loss': 1.3993, 'grad_norm': 0.5080900192260742, 'learning_rate': 9.258634670715238e-05, 'epoch': 3.21, 'num_input_tokens_seen': 4683312} +{'loss': 1.3511, 'grad_norm': 0.5958444476127625, 'learning_rate': 9.230965830637821e-05, 'epoch': 3.25, 'num_input_tokens_seen': 4739792} +{'loss': 1.3009, 'grad_norm': 0.7206479907035828, 'learning_rate': 9.202833017478422e-05, 'epoch': 3.28, 'num_input_tokens_seen': 4791312} +{'loss': 1.3149, 'grad_norm': 0.6154108643531799, 'learning_rate': 9.174239316317033e-05, 'epoch': 3.32, 'num_input_tokens_seen': 4845744} +{'loss': 1.3126, 'grad_norm': 0.7271692156791687, 'learning_rate': 9.145187862775209e-05, 'epoch': 3.36, 'num_input_tokens_seen': 4897472} +{'loss': 1.341, 'grad_norm': 0.6346867680549622, 'learning_rate': 9.11568184267221e-05, 'epoch': 3.4, 'num_input_tokens_seen': 4954272} +{'loss': 1.2816, 'grad_norm': 0.686673104763031, 'learning_rate': 9.085724491675642e-05, 'epoch': 3.43, 'num_input_tokens_seen': 5010272} +{'loss': 1.3563, 'grad_norm': 0.6660853624343872, 'learning_rate': 9.055319094946633e-05, 'epoch': 3.47, 'num_input_tokens_seen': 5066032} +{'loss': 1.3326, 'grad_norm': 0.698714554309845, 'learning_rate': 9.02446898677957e-05, 'epoch': 3.51, 'num_input_tokens_seen': 5115488} +{'loss': 1.4707, 'grad_norm': 0.7857383489608765, 'learning_rate': 8.993177550236464e-05, 'epoch': 3.55, 'num_input_tokens_seen': 5174736} +{'loss': 1.2941, 'grad_norm': 0.7507392764091492, 'learning_rate': 8.961448216775954e-05, 'epoch': 3.58, 'num_input_tokens_seen': 5230016} +{'loss': 1.3435, 'grad_norm': 0.7710234522819519, 'learning_rate': 8.92928446587701e-05, 'epoch': 3.62, 'num_input_tokens_seen': 5289728} +{'loss': 1.3954, 'grad_norm': 0.7039404511451721, 'learning_rate': 8.896689824657372e-05, 'epoch': 3.66, 'num_input_tokens_seen': 5344176} +{'loss': 1.2821, 'grad_norm': 0.8286743760108948, 'learning_rate': 8.863667867486756e-05, 'epoch': 3.7, 'num_input_tokens_seen': 5396144} +{'loss': 1.3893, 'grad_norm': 0.8500336408615112, 'learning_rate': 8.83022221559489e-05, 'epoch': 3.73, 'num_input_tokens_seen': 5448672} +{'loss': 1.3796, 'grad_norm': 0.7117451429367065, 'learning_rate': 8.796356536674403e-05, 'epoch': 3.77, 'num_input_tokens_seen': 5503696} +{'loss': 1.4454, 'grad_norm': 0.7383677959442139, 'learning_rate': 8.762074544478623e-05, 'epoch': 3.81, 'num_input_tokens_seen': 5557888} +{'loss': 1.3422, 'grad_norm': 0.8263104557991028, 'learning_rate': 8.727379998414311e-05, 'epoch': 3.84, 'num_input_tokens_seen': 5609232} +{'loss': 1.3338, 'grad_norm': 0.6335726976394653, 'learning_rate': 8.692276703129421e-05, 'epoch': 3.88, 'num_input_tokens_seen': 5667072} +{'loss': 1.3997, 'grad_norm': 0.7418395280838013, 'learning_rate': 8.656768508095853e-05, 'epoch': 3.92, 'num_input_tokens_seen': 5722672} +{'loss': 1.434, 'grad_norm': 0.8165796399116516, 'learning_rate': 8.620859307187339e-05, 'epoch': 3.96, 'num_input_tokens_seen': 5780096} +{'loss': 1.3018, 'grad_norm': 0.905696451663971, 'learning_rate': 8.584553038252414e-05, 'epoch': 3.99, 'num_input_tokens_seen': 5835632} +{'loss': 1.2729, 'grad_norm': 0.7397456765174866, 'learning_rate': 8.547853682682604e-05, 'epoch': 4.03, 'num_input_tokens_seen': 5891616} +{'loss': 1.1823, 'grad_norm': 0.7480332255363464, 'learning_rate': 8.510765264975813e-05, 'epoch': 4.07, 'num_input_tokens_seen': 5942400} +{'loss': 1.4483, 'grad_norm': 0.9288114905357361, 'learning_rate': 8.473291852294987e-05, 'epoch': 4.11, 'num_input_tokens_seen': 5997728} +{'loss': 1.1807, 'grad_norm': 0.9873988032341003, 'learning_rate': 8.435437554022115e-05, 'epoch': 4.14, 'num_input_tokens_seen': 6049312} +{'loss': 1.2298, 'grad_norm': 0.7703122496604919, 'learning_rate': 8.397206521307584e-05, 'epoch': 4.18, 'num_input_tokens_seen': 6108288} +{'loss': 1.1629, 'grad_norm': 0.890841007232666, 'learning_rate': 8.358602946614951e-05, 'epoch': 4.22, 'num_input_tokens_seen': 6160144} +{'loss': 1.2641, 'grad_norm': 0.8476828932762146, 'learning_rate': 8.319631063261209e-05, 'epoch': 4.26, 'num_input_tokens_seen': 6216672} +{'loss': 1.2478, 'grad_norm': 0.8666311502456665, 'learning_rate': 8.280295144952536e-05, 'epoch': 4.29, 'num_input_tokens_seen': 6271168} +{'loss': 1.2675, 'grad_norm': 1.005199670791626, 'learning_rate': 8.240599505315655e-05, 'epoch': 4.33, 'num_input_tokens_seen': 6326992} +{'loss': 1.2551, 'grad_norm': 0.9311610460281372, 'learning_rate': 8.200548497424778e-05, 'epoch': 4.37, 'num_input_tokens_seen': 6380944} +{'loss': 1.2011, 'grad_norm': 0.8687139749526978, 'learning_rate': 8.160146513324254e-05, 'epoch': 4.4, 'num_input_tokens_seen': 6436144} +{'loss': 1.1243, 'grad_norm': 0.8516527414321899, 'learning_rate': 8.119397983546932e-05, 'epoch': 4.44, 'num_input_tokens_seen': 6487824} +{'loss': 1.2775, 'grad_norm': 0.9878633618354797, 'learning_rate': 8.07830737662829e-05, 'epoch': 4.48, 'num_input_tokens_seen': 6540448} +{'loss': 1.208, 'grad_norm': 1.2680439949035645, 'learning_rate': 8.036879198616434e-05, 'epoch': 4.52, 'num_input_tokens_seen': 6593248} +{'loss': 1.1828, 'grad_norm': 0.96169513463974, 'learning_rate': 7.99511799257793e-05, 'epoch': 4.55, 'num_input_tokens_seen': 6648976} +{'loss': 1.2522, 'grad_norm': 1.041894555091858, 'learning_rate': 7.953028338099627e-05, 'epoch': 4.59, 'num_input_tokens_seen': 6703440} +{'loss': 1.2285, 'grad_norm': 1.0338892936706543, 'learning_rate': 7.910614850786448e-05, 'epoch': 4.63, 'num_input_tokens_seen': 6755360} +{'loss': 1.2556, 'grad_norm': 0.849757969379425, 'learning_rate': 7.86788218175523e-05, 'epoch': 4.67, 'num_input_tokens_seen': 6811440} +{'loss': 1.3034, 'grad_norm': 0.8915488719940186, 'learning_rate': 7.82483501712469e-05, 'epoch': 4.7, 'num_input_tokens_seen': 6870992} +{'loss': 1.2352, 'grad_norm': 1.2227041721343994, 'learning_rate': 7.781478077501525e-05, 'epoch': 4.74, 'num_input_tokens_seen': 6927120} +{'loss': 1.2768, 'grad_norm': 0.7694201469421387, 'learning_rate': 7.737816117462752e-05, 'epoch': 4.78, 'num_input_tokens_seen': 6983216} +{'loss': 1.3318, 'grad_norm': 0.880551815032959, 'learning_rate': 7.693853925034315e-05, 'epoch': 4.81, 'num_input_tokens_seen': 7041648} +{'loss': 1.2715, 'grad_norm': 0.9098656177520752, 'learning_rate': 7.649596321166024e-05, 'epoch': 4.85, 'num_input_tokens_seen': 7095104} +{'loss': 1.1691, 'grad_norm': 1.0869756937026978, 'learning_rate': 7.605048159202883e-05, 'epoch': 4.89, 'num_input_tokens_seen': 7146512} +{'loss': 1.2651, 'grad_norm': 1.0197746753692627, 'learning_rate': 7.560214324352858e-05, 'epoch': 4.93, 'num_input_tokens_seen': 7198704} +{'loss': 1.3019, 'grad_norm': 0.8996115326881409, 'learning_rate': 7.515099733151177e-05, 'epoch': 4.96, 'num_input_tokens_seen': 7254608} +{'loss': 1.2474, 'grad_norm': 0.9211968779563904, 'learning_rate': 7.469709332921155e-05, 'epoch': 5.0, 'num_input_tokens_seen': 7312224} +{'loss': 1.1313, 'grad_norm': 0.8490816354751587, 'learning_rate': 7.424048101231686e-05, 'epoch': 5.04, 'num_input_tokens_seen': 7365456} +{'loss': 1.265, 'grad_norm': 1.145821452140808, 'learning_rate': 7.378121045351378e-05, 'epoch': 5.08, 'num_input_tokens_seen': 7421200} +{'loss': 1.1876, 'grad_norm': 1.3000530004501343, 'learning_rate': 7.331933201699457e-05, 'epoch': 5.11, 'num_input_tokens_seen': 7474704} +{'loss': 1.063, 'grad_norm': 1.0333030223846436, 'learning_rate': 7.285489635293472e-05, 'epoch': 5.15, 'num_input_tokens_seen': 7527856} +{'loss': 1.1132, 'grad_norm': 1.1004230976104736, 'learning_rate': 7.238795439193848e-05, 'epoch': 5.19, 'num_input_tokens_seen': 7585040} +{'loss': 1.1264, 'grad_norm': 1.0957094430923462, 'learning_rate': 7.191855733945387e-05, 'epoch': 5.23, 'num_input_tokens_seen': 7643952} +{'loss': 1.1316, 'grad_norm': 1.0807350873947144, 'learning_rate': 7.14467566701573e-05, 'epoch': 5.26, 'num_input_tokens_seen': 7699952} +{'loss': 1.1275, 'grad_norm': 1.1621692180633545, 'learning_rate': 7.097260412230886e-05, 'epoch': 5.3, 'num_input_tokens_seen': 7754384} +{'loss': 1.1736, 'grad_norm': 1.1265610456466675, 'learning_rate': 7.049615169207864e-05, 'epoch': 5.34, 'num_input_tokens_seen': 7811760} +{'loss': 1.1063, 'grad_norm': 1.049169659614563, 'learning_rate': 7.001745162784477e-05, 'epoch': 5.37, 'num_input_tokens_seen': 7867376} +{'loss': 1.0924, 'grad_norm': 1.2782608270645142, 'learning_rate': 6.953655642446368e-05, 'epoch': 5.41, 'num_input_tokens_seen': 7920736} +{'loss': 1.1583, 'grad_norm': 1.0864709615707397, 'learning_rate': 6.905351881751372e-05, 'epoch': 5.45, 'num_input_tokens_seen': 7975088} +{'loss': 1.0812, 'grad_norm': 1.2666634321212769, 'learning_rate': 6.856839177751176e-05, 'epoch': 5.49, 'num_input_tokens_seen': 8027328} +{'loss': 1.0749, 'grad_norm': 0.9478985667228699, 'learning_rate': 6.808122850410461e-05, 'epoch': 5.52, 'num_input_tokens_seen': 8083392} +{'loss': 1.1026, 'grad_norm': 1.4468153715133667, 'learning_rate': 6.759208242023509e-05, 'epoch': 5.56, 'num_input_tokens_seen': 8134800} +{'loss': 1.092, 'grad_norm': 1.2189656496047974, 'learning_rate': 6.710100716628344e-05, 'epoch': 5.6, 'num_input_tokens_seen': 8188944} +{'loss': 1.2196, 'grad_norm': 1.040446400642395, 'learning_rate': 6.660805659418516e-05, 'epoch': 5.64, 'num_input_tokens_seen': 8248096} +{'loss': 1.1435, 'grad_norm': 1.137902021408081, 'learning_rate': 6.611328476152557e-05, 'epoch': 5.67, 'num_input_tokens_seen': 8302864} +{'loss': 1.0955, 'grad_norm': 1.19194757938385, 'learning_rate': 6.561674592561163e-05, 'epoch': 5.71, 'num_input_tokens_seen': 8356800} +{'loss': 1.2012, 'grad_norm': 1.2011014223098755, 'learning_rate': 6.511849453752223e-05, 'epoch': 5.75, 'num_input_tokens_seen': 8410576} +{'loss': 1.1735, 'grad_norm': 1.2967370748519897, 'learning_rate': 6.461858523613684e-05, 'epoch': 5.79, 'num_input_tokens_seen': 8468016} +{'loss': 1.1583, 'grad_norm': 1.2468522787094116, 'learning_rate': 6.411707284214384e-05, 'epoch': 5.82, 'num_input_tokens_seen': 8518768} +{'loss': 1.1513, 'grad_norm': 1.3397902250289917, 'learning_rate': 6.361401235202872e-05, 'epoch': 5.86, 'num_input_tokens_seen': 8573872} +{'loss': 1.06, 'grad_norm': 1.1201503276824951, 'learning_rate': 6.310945893204324e-05, 'epoch': 5.9, 'num_input_tokens_seen': 8628928} +{'loss': 1.1553, 'grad_norm': 1.3698691129684448, 'learning_rate': 6.26034679121557e-05, 'epoch': 5.93, 'num_input_tokens_seen': 8681104} +{'loss': 1.1209, 'grad_norm': 1.2262195348739624, 'learning_rate': 6.209609477998338e-05, 'epoch': 5.97, 'num_input_tokens_seen': 8735984} +{'loss': 1.1008, 'grad_norm': 0.9644012451171875, 'learning_rate': 6.158739517470786e-05, 'epoch': 6.01, 'num_input_tokens_seen': 8790416} +{'loss': 1.0679, 'grad_norm': 1.0709476470947266, 'learning_rate': 6.107742488097338e-05, 'epoch': 6.05, 'num_input_tokens_seen': 8848464} +{'loss': 1.0223, 'grad_norm': 1.0696582794189453, 'learning_rate': 6.056623982276944e-05, 'epoch': 6.08, 'num_input_tokens_seen': 8900736} +{'loss': 1.0493, 'grad_norm': 1.2330180406570435, 'learning_rate': 6.005389605729824e-05, 'epoch': 6.12, 'num_input_tokens_seen': 8957616} +{'loss': 1.0017, 'grad_norm': 1.1921484470367432, 'learning_rate': 5.9540449768827246e-05, 'epoch': 6.16, 'num_input_tokens_seen': 9013680} +{'loss': 1.0215, 'grad_norm': 1.383009433746338, 'learning_rate': 5.902595726252801e-05, 'epoch': 6.2, 'num_input_tokens_seen': 9069536} +{'eval_loss': 1.6311813592910767, 'eval_runtime': 10.0618, 'eval_samples_per_second': 99.386, 'eval_steps_per_second': 7.156, 'epoch': 6.22, 'num_input_tokens_seen': 9110112} +{'loss': 1.0748, 'grad_norm': 1.2158509492874146, 'learning_rate': 5.851047495830163e-05, 'epoch': 6.23, 'num_input_tokens_seen': 9127472} +{'loss': 0.9791, 'grad_norm': 1.2159379720687866, 'learning_rate': 5.799405938459175e-05, 'epoch': 6.27, 'num_input_tokens_seen': 9176608} +{'loss': 1.0378, 'grad_norm': 1.4739444255828857, 'learning_rate': 5.747676717218549e-05, 'epoch': 6.31, 'num_input_tokens_seen': 9228576} +{'loss': 1.0241, 'grad_norm': 1.128970980644226, 'learning_rate': 5.695865504800327e-05, 'epoch': 6.35, 'num_input_tokens_seen': 9286272} +{'loss': 1.0465, 'grad_norm': 1.5108153820037842, 'learning_rate': 5.643977982887815e-05, 'epoch': 6.38, 'num_input_tokens_seen': 9342096} +{'loss': 1.0456, 'grad_norm': 1.2417353391647339, 'learning_rate': 5.5920198415325064e-05, 'epoch': 6.42, 'num_input_tokens_seen': 9396240} +{'loss': 1.0381, 'grad_norm': 1.093074083328247, 'learning_rate': 5.539996778530115e-05, 'epoch': 6.46, 'num_input_tokens_seen': 9451488} +{'loss': 1.1424, 'grad_norm': 1.3556379079818726, 'learning_rate': 5.487914498795747e-05, 'epoch': 6.49, 'num_input_tokens_seen': 9506784} +{'loss': 1.0076, 'grad_norm': 1.4488335847854614, 'learning_rate': 5.435778713738292e-05, 'epoch': 6.53, 'num_input_tokens_seen': 9562448} +{'loss': 1.0462, 'grad_norm': 1.2876710891723633, 'learning_rate': 5.383595140634093e-05, 'epoch': 6.57, 'num_input_tokens_seen': 9615664} +{'loss': 1.0512, 'grad_norm': 1.330777645111084, 'learning_rate': 5.3313695020000024e-05, 'epoch': 6.61, 'num_input_tokens_seen': 9671744} +{'loss': 0.9775, 'grad_norm': 1.5395786762237549, 'learning_rate': 5.279107524965819e-05, 'epoch': 6.64, 'num_input_tokens_seen': 9724048} +{'loss': 1.0258, 'grad_norm': 1.4420833587646484, 'learning_rate': 5.226814940646269e-05, 'epoch': 6.68, 'num_input_tokens_seen': 9776176} +{'loss': 1.0739, 'grad_norm': 1.342041254043579, 'learning_rate': 5.174497483512506e-05, 'epoch': 6.72, 'num_input_tokens_seen': 9832192} +{'loss': 1.0348, 'grad_norm': 1.5209355354309082, 'learning_rate': 5.1221608907632665e-05, 'epoch': 6.76, 'num_input_tokens_seen': 9886048} +{'loss': 1.0928, 'grad_norm': 1.2620528936386108, 'learning_rate': 5.0698109016957274e-05, 'epoch': 6.79, 'num_input_tokens_seen': 9939184} +{'loss': 1.004, 'grad_norm': 1.2428035736083984, 'learning_rate': 5.017453257076119e-05, 'epoch': 6.83, 'num_input_tokens_seen': 9992272} +{'loss': 0.9519, 'grad_norm': 1.2153327465057373, 'learning_rate': 4.965093698510193e-05, 'epoch': 6.87, 'num_input_tokens_seen': 10045168} +{'loss': 1.1466, 'grad_norm': 1.3050085306167603, 'learning_rate': 4.912737967813583e-05, 'epoch': 6.91, 'num_input_tokens_seen': 10102720} +{'loss': 1.0908, 'grad_norm': 1.3073248863220215, 'learning_rate': 4.860391806382157e-05, 'epoch': 6.94, 'num_input_tokens_seen': 10156832} +{'loss': 0.9568, 'grad_norm': 1.280698299407959, 'learning_rate': 4.8080609545624004e-05, 'epoch': 6.98, 'num_input_tokens_seen': 10210208} +{'loss': 1.0496, 'grad_norm': 1.1008414030075073, 'learning_rate': 4.755751151021934e-05, 'epoch': 7.02, 'num_input_tokens_seen': 10267456} +{'loss': 0.8353, 'grad_norm': 1.4668625593185425, 'learning_rate': 4.703468132120193e-05, 'epoch': 7.05, 'num_input_tokens_seen': 10317312} +{'loss': 0.9929, 'grad_norm': 1.4211206436157227, 'learning_rate': 4.6512176312793736e-05, 'epoch': 7.09, 'num_input_tokens_seen': 10373232} +{'loss': 1.0196, 'grad_norm': 1.2859231233596802, 'learning_rate': 4.599005378355706e-05, 'epoch': 7.13, 'num_input_tokens_seen': 10429936} +{'loss': 0.9094, 'grad_norm': 1.291049599647522, 'learning_rate': 4.5468370990111006e-05, 'epoch': 7.17, 'num_input_tokens_seen': 10485824} +{'loss': 0.9206, 'grad_norm': 1.2573702335357666, 'learning_rate': 4.494718514085268e-05, 'epoch': 7.2, 'num_input_tokens_seen': 10540736} +{'loss': 1.0173, 'grad_norm': 1.6116251945495605, 'learning_rate': 4.442655338968373e-05, 'epoch': 7.24, 'num_input_tokens_seen': 10596192} +{'loss': 1.0234, 'grad_norm': 1.2712162733078003, 'learning_rate': 4.390653282974264e-05, 'epoch': 7.28, 'num_input_tokens_seen': 10651376} +{'loss': 0.9771, 'grad_norm': 1.3022940158843994, 'learning_rate': 4.3387180487143876e-05, 'epoch': 7.32, 'num_input_tokens_seen': 10704512} +{'loss': 0.8928, 'grad_norm': 1.3681883811950684, 'learning_rate': 4.2868553314724425e-05, 'epoch': 7.35, 'num_input_tokens_seen': 10758656} +{'loss': 1.0248, 'grad_norm': 1.2594826221466064, 'learning_rate': 4.23507081857981e-05, 'epoch': 7.39, 'num_input_tokens_seen': 10814112} +{'loss': 0.8644, 'grad_norm': 1.401910424232483, 'learning_rate': 4.1833701887918904e-05, 'epoch': 7.43, 'num_input_tokens_seen': 10869264} +{'loss': 0.9704, 'grad_norm': 1.2500008344650269, 'learning_rate': 4.131759111665349e-05, 'epoch': 7.47, 'num_input_tokens_seen': 10922736} +{'loss': 1.0462, 'grad_norm': 1.280358076095581, 'learning_rate': 4.080243246936399e-05, 'epoch': 7.5, 'num_input_tokens_seen': 10979712} +{'loss': 0.9357, 'grad_norm': 1.3356202840805054, 'learning_rate': 4.028828243900141e-05, 'epoch': 7.54, 'num_input_tokens_seen': 11036672} +{'loss': 0.9906, 'grad_norm': 1.3757556676864624, 'learning_rate': 3.9775197407910485e-05, 'epoch': 7.58, 'num_input_tokens_seen': 11092496} +{'loss': 0.9157, 'grad_norm': 1.389939308166504, 'learning_rate': 3.926323364164684e-05, 'epoch': 7.61, 'num_input_tokens_seen': 11143456} +{'loss': 0.9151, 'grad_norm': 1.3326400518417358, 'learning_rate': 3.875244728280676e-05, 'epoch': 7.65, 'num_input_tokens_seen': 11200064} +{'loss': 1.0072, 'grad_norm': 1.2925583124160767, 'learning_rate': 3.82428943448705e-05, 'epoch': 7.69, 'num_input_tokens_seen': 11256752} +{'loss': 0.9138, 'grad_norm': 1.608323574066162, 'learning_rate': 3.773463070605987e-05, 'epoch': 7.73, 'num_input_tokens_seen': 11311200} +{'loss': 0.9126, 'grad_norm': 1.2572425603866577, 'learning_rate': 3.7227712103210486e-05, 'epoch': 7.76, 'num_input_tokens_seen': 11362640} +{'loss': 0.921, 'grad_norm': 1.366409420967102, 'learning_rate': 3.6722194125659556e-05, 'epoch': 7.8, 'num_input_tokens_seen': 11420080} +{'loss': 0.9167, 'grad_norm': 1.367814302444458, 'learning_rate': 3.6218132209150045e-05, 'epoch': 7.84, 'num_input_tokens_seen': 11471056} +{'loss': 1.0907, 'grad_norm': 1.4953104257583618, 'learning_rate': 3.5715581629751326e-05, 'epoch': 7.88, 'num_input_tokens_seen': 11526928} +{'loss': 0.9222, 'grad_norm': 1.4320324659347534, 'learning_rate': 3.5214597497797684e-05, 'epoch': 7.91, 'num_input_tokens_seen': 11580928} +{'loss': 0.9267, 'grad_norm': 1.6235154867172241, 'learning_rate': 3.471523475184472e-05, 'epoch': 7.95, 'num_input_tokens_seen': 11634416} +{'loss': 0.9974, 'grad_norm': 1.4394381046295166, 'learning_rate': 3.4217548152644885e-05, 'epoch': 7.99, 'num_input_tokens_seen': 11688928} +{'loss': 0.924, 'grad_norm': 1.2756644487380981, 'learning_rate': 3.372159227714218e-05, 'epoch': 8.02, 'num_input_tokens_seen': 11741968} +{'loss': 0.7697, 'grad_norm': 1.3790043592453003, 'learning_rate': 3.322742151248725e-05, 'epoch': 8.06, 'num_input_tokens_seen': 11794432} +{'loss': 0.9866, 'grad_norm': 1.3841499090194702, 'learning_rate': 3.273509005007327e-05, 'epoch': 8.1, 'num_input_tokens_seen': 11849744} +{'loss': 0.9879, 'grad_norm': 1.5157493352890015, 'learning_rate': 3.224465187959316e-05, 'epoch': 8.14, 'num_input_tokens_seen': 11904800} +{'loss': 0.8112, 'grad_norm': 1.4144805669784546, 'learning_rate': 3.1756160783119016e-05, 'epoch': 8.17, 'num_input_tokens_seen': 11960208} +{'loss': 0.8722, 'grad_norm': 1.2442865371704102, 'learning_rate': 3.12696703292044e-05, 'epoch': 8.21, 'num_input_tokens_seen': 12012304} +{'loss': 0.9897, 'grad_norm': 1.4178701639175415, 'learning_rate': 3.078523386700982e-05, 'epoch': 8.25, 'num_input_tokens_seen': 12067760} +{'loss': 0.9071, 'grad_norm': 1.380922794342041, 'learning_rate': 3.0302904520452447e-05, 'epoch': 8.29, 'num_input_tokens_seen': 12127248} +{'loss': 0.8776, 'grad_norm': 1.3610669374465942, 'learning_rate': 2.9822735182380496e-05, 'epoch': 8.32, 'num_input_tokens_seen': 12183520} +{'loss': 0.9587, 'grad_norm': 1.3397613763809204, 'learning_rate': 2.934477850877292e-05, 'epoch': 8.36, 'num_input_tokens_seen': 12240960} +{'loss': 0.8807, 'grad_norm': 1.259645938873291, 'learning_rate': 2.886908691296504e-05, 'epoch': 8.4, 'num_input_tokens_seen': 12295600} +{'loss': 0.8717, 'grad_norm': 1.4922188520431519, 'learning_rate': 2.8395712559900877e-05, 'epoch': 8.44, 'num_input_tokens_seen': 12349072} +{'loss': 0.8578, 'grad_norm': 1.2398678064346313, 'learning_rate': 2.7924707360412746e-05, 'epoch': 8.47, 'num_input_tokens_seen': 12403360} +{'loss': 0.9193, 'grad_norm': 1.421632170677185, 'learning_rate': 2.7456122965528475e-05, 'epoch': 8.51, 'num_input_tokens_seen': 12462624} +{'loss': 0.909, 'grad_norm': 1.4103708267211914, 'learning_rate': 2.699001076080742e-05, 'epoch': 8.55, 'num_input_tokens_seen': 12519456} +{'loss': 0.9335, 'grad_norm': 1.4057148694992065, 'learning_rate': 2.6526421860705473e-05, 'epoch': 8.58, 'num_input_tokens_seen': 12573888} +{'loss': 0.8227, 'grad_norm': 1.882209300994873, 'learning_rate': 2.6065407102969664e-05, 'epoch': 8.62, 'num_input_tokens_seen': 12627520} +{'loss': 0.8731, 'grad_norm': 1.6502835750579834, 'learning_rate': 2.560701704306336e-05, 'epoch': 8.66, 'num_input_tokens_seen': 12680768} +{'loss': 0.8538, 'grad_norm': 1.6794430017471313, 'learning_rate': 2.5151301948622237e-05, 'epoch': 8.7, 'num_input_tokens_seen': 12734640} +{'loss': 0.8654, 'grad_norm': 1.4031795263290405, 'learning_rate': 2.469831179394182e-05, 'epoch': 8.73, 'num_input_tokens_seen': 12787200} +{'loss': 0.8738, 'grad_norm': 1.460951328277588, 'learning_rate': 2.4248096254497288e-05, 'epoch': 8.77, 'num_input_tokens_seen': 12840064} +{'loss': 0.9721, 'grad_norm': 1.3095109462738037, 'learning_rate': 2.3800704701496053e-05, 'epoch': 8.81, 'num_input_tokens_seen': 12895776} +{'loss': 0.9084, 'grad_norm': 1.21684992313385, 'learning_rate': 2.33561861964635e-05, 'epoch': 8.85, 'num_input_tokens_seen': 12946496} +{'loss': 0.8723, 'grad_norm': 1.4043666124343872, 'learning_rate': 2.2914589485863014e-05, 'epoch': 8.88, 'num_input_tokens_seen': 12999616} +{'loss': 0.8646, 'grad_norm': 1.6057589054107666, 'learning_rate': 2.247596299575022e-05, 'epoch': 8.92, 'num_input_tokens_seen': 13053840} +{'loss': 0.869, 'grad_norm': 1.4530081748962402, 'learning_rate': 2.2040354826462668e-05, 'epoch': 8.96, 'num_input_tokens_seen': 13107104} +{'loss': 0.8733, 'grad_norm': 1.2964198589324951, 'learning_rate': 2.160781274734495e-05, 'epoch': 9.0, 'num_input_tokens_seen': 13162592} +{'loss': 0.9076, 'grad_norm': 1.2949548959732056, 'learning_rate': 2.117838419151034e-05, 'epoch': 9.03, 'num_input_tokens_seen': 13217488} +{'loss': 0.828, 'grad_norm': 1.4165751934051514, 'learning_rate': 2.0752116250639225e-05, 'epoch': 9.07, 'num_input_tokens_seen': 13270704} +{'loss': 0.8922, 'grad_norm': 1.1998887062072754, 'learning_rate': 2.0329055669814934e-05, 'epoch': 9.11, 'num_input_tokens_seen': 13325168} +{'loss': 0.8684, 'grad_norm': 1.4165197610855103, 'learning_rate': 1.9909248842397584e-05, 'epoch': 9.14, 'num_input_tokens_seen': 13385488} +{'loss': 0.8703, 'grad_norm': 1.4951928853988647, 'learning_rate': 1.9492741804936622e-05, 'epoch': 9.18, 'num_input_tokens_seen': 13439872} +{'loss': 0.8381, 'grad_norm': 1.4441372156143188, 'learning_rate': 1.9079580232122303e-05, 'epoch': 9.22, 'num_input_tokens_seen': 13494080} +{'loss': 0.9077, 'grad_norm': 1.3826444149017334, 'learning_rate': 1.866980943177699e-05, 'epoch': 9.26, 'num_input_tokens_seen': 13552208} +{'loss': 0.8887, 'grad_norm': 1.3692766427993774, 'learning_rate': 1.8263474339886628e-05, 'epoch': 9.29, 'num_input_tokens_seen': 13608832} +{'loss': 0.8205, 'grad_norm': 1.2762523889541626, 'learning_rate': 1.7860619515673033e-05, 'epoch': 9.33, 'num_input_tokens_seen': 13663760} +{'loss': 0.8274, 'grad_norm': 1.3160815238952637, 'learning_rate': 1.746128913670746e-05, 'epoch': 9.37, 'num_input_tokens_seen': 13716800} +{'loss': 0.9119, 'grad_norm': 1.5260809659957886, 'learning_rate': 1.7065526994065973e-05, 'epoch': 9.41, 'num_input_tokens_seen': 13774816} +{'loss': 0.8099, 'grad_norm': 1.2529041767120361, 'learning_rate': 1.667337648752738e-05, 'epoch': 9.44, 'num_input_tokens_seen': 13830048} +{'loss': 0.8317, 'grad_norm': 1.3622620105743408, 'learning_rate': 1.6284880620813848e-05, 'epoch': 9.48, 'num_input_tokens_seen': 13885008} +{'loss': 0.7895, 'grad_norm': 1.245958924293518, 'learning_rate': 1.5900081996875083e-05, 'epoch': 9.52, 'num_input_tokens_seen': 13939504} +{'loss': 0.8474, 'grad_norm': 1.214984655380249, 'learning_rate': 1.551902281321651e-05, 'epoch': 9.56, 'num_input_tokens_seen': 13992352} +{'loss': 0.8418, 'grad_norm': 1.4553762674331665, 'learning_rate': 1.5141744857271778e-05, 'epoch': 9.59, 'num_input_tokens_seen': 14044656} +{'loss': 0.7605, 'grad_norm': 1.6518710851669312, 'learning_rate': 1.4768289501820265e-05, 'epoch': 9.63, 'num_input_tokens_seen': 14095664} +{'loss': 0.8928, 'grad_norm': 1.6158061027526855, 'learning_rate': 1.439869770045018e-05, 'epoch': 9.67, 'num_input_tokens_seen': 14151808} +{'loss': 0.7745, 'grad_norm': 1.4202730655670166, 'learning_rate': 1.4033009983067452e-05, 'epoch': 9.7, 'num_input_tokens_seen': 14202128} +{'loss': 0.8861, 'grad_norm': 1.5133007764816284, 'learning_rate': 1.367126645145121e-05, 'epoch': 9.74, 'num_input_tokens_seen': 14254320} +{'loss': 0.7954, 'grad_norm': 1.3368923664093018, 'learning_rate': 1.3313506774856177e-05, 'epoch': 9.78, 'num_input_tokens_seen': 14307104} +{'loss': 0.8982, 'grad_norm': 1.5365697145462036, 'learning_rate': 1.29597701856625e-05, 'epoch': 9.82, 'num_input_tokens_seen': 14364592} +{'loss': 0.8608, 'grad_norm': 1.4836835861206055, 'learning_rate': 1.2610095475073414e-05, 'epoch': 9.85, 'num_input_tokens_seen': 14417696} +{'loss': 0.8443, 'grad_norm': 1.3931705951690674, 'learning_rate': 1.22645209888614e-05, 'epoch': 9.89, 'num_input_tokens_seen': 14472720} +{'loss': 0.8242, 'grad_norm': 1.2984020709991455, 'learning_rate': 1.1923084623163172e-05, 'epoch': 9.93, 'num_input_tokens_seen': 14526336} +{'loss': 0.7819, 'grad_norm': 1.428997278213501, 'learning_rate': 1.1585823820323843e-05, 'epoch': 9.97, 'num_input_tokens_seen': 14577424} +{'loss': 0.8295, 'grad_norm': 1.3891069889068604, 'learning_rate': 1.1252775564791024e-05, 'epoch': 10.0, 'num_input_tokens_seen': 14633552} +{'loss': 0.8614, 'grad_norm': 1.2659319639205933, 'learning_rate': 1.0923976379059058e-05, 'epoch': 10.04, 'num_input_tokens_seen': 14690976} +{'loss': 0.7815, 'grad_norm': 1.2245172262191772, 'learning_rate': 1.0599462319663905e-05, 'epoch': 10.08, 'num_input_tokens_seen': 14742128} +{'loss': 0.7795, 'grad_norm': 1.368401288986206, 'learning_rate': 1.0279268973229089e-05, 'epoch': 10.12, 'num_input_tokens_seen': 14794288} +{'loss': 0.7585, 'grad_norm': 1.4876455068588257, 'learning_rate': 9.963431452563332e-06, 'epoch': 10.15, 'num_input_tokens_seen': 14846560} +{'loss': 0.7785, 'grad_norm': 1.377921223640442, 'learning_rate': 9.651984392809914e-06, 'epoch': 10.19, 'num_input_tokens_seen': 14900528} +{'loss': 0.848, 'grad_norm': 1.3406357765197754, 'learning_rate': 9.344961947648623e-06, 'epoch': 10.23, 'num_input_tokens_seen': 14956624} +{'loss': 0.8245, 'grad_norm': 1.312232494354248, 'learning_rate': 9.042397785550405e-06, 'epoch': 10.26, 'num_input_tokens_seen': 15013488} +{'loss': 0.811, 'grad_norm': 1.317514419555664, 'learning_rate': 8.744325086085248e-06, 'epoch': 10.3, 'num_input_tokens_seen': 15068000} +{'loss': 0.8597, 'grad_norm': 1.4466396570205688, 'learning_rate': 8.450776536283594e-06, 'epoch': 10.34, 'num_input_tokens_seen': 15124672} +{'loss': 0.7463, 'grad_norm': 1.323728084564209, 'learning_rate': 8.16178432705192e-06, 'epoch': 10.38, 'num_input_tokens_seen': 15174960} +{'loss': 0.7385, 'grad_norm': 1.355770468711853, 'learning_rate': 7.877380149642626e-06, 'epoch': 10.41, 'num_input_tokens_seen': 15228192} +{'loss': 0.8362, 'grad_norm': 1.2545582056045532, 'learning_rate': 7.597595192178702e-06, 'epoch': 10.45, 'num_input_tokens_seen': 15282256} +{'loss': 0.7866, 'grad_norm': 1.336365818977356, 'learning_rate': 7.322460136233622e-06, 'epoch': 10.49, 'num_input_tokens_seen': 15337392} +{'loss': 0.8247, 'grad_norm': 1.2446874380111694, 'learning_rate': 7.052005153466779e-06, 'epoch': 10.53, 'num_input_tokens_seen': 15392448} +{'loss': 0.8383, 'grad_norm': 1.2695002555847168, 'learning_rate': 6.786259902314768e-06, 'epoch': 10.56, 'num_input_tokens_seen': 15446672} +{'loss': 0.7268, 'grad_norm': 1.2946388721466064, 'learning_rate': 6.52525352473905e-06, 'epoch': 10.6, 'num_input_tokens_seen': 15496064} +{'loss': 0.8733, 'grad_norm': 1.3027721643447876, 'learning_rate': 6.269014643030213e-06, 'epoch': 10.64, 'num_input_tokens_seen': 15549952} +{'loss': 0.8492, 'grad_norm': 1.340345025062561, 'learning_rate': 6.017571356669183e-06, 'epoch': 10.67, 'num_input_tokens_seen': 15607920} +{'loss': 0.8154, 'grad_norm': 1.272009015083313, 'learning_rate': 5.770951239245803e-06, 'epoch': 10.71, 'num_input_tokens_seen': 15663280} +{'loss': 0.7953, 'grad_norm': 1.3935860395431519, 'learning_rate': 5.529181335435124e-06, 'epoch': 10.75, 'num_input_tokens_seen': 15718496} +{'loss': 0.8559, 'grad_norm': 1.3685379028320312, 'learning_rate': 5.292288158031594e-06, 'epoch': 10.79, 'num_input_tokens_seen': 15772560} +{'loss': 0.8111, 'grad_norm': 1.5064653158187866, 'learning_rate': 5.060297685041659e-06, 'epoch': 10.82, 'num_input_tokens_seen': 15828464} +{'loss': 0.8946, 'grad_norm': 1.5226088762283325, 'learning_rate': 4.833235356834959e-06, 'epoch': 10.86, 'num_input_tokens_seen': 15886160} +{'loss': 0.831, 'grad_norm': 1.6443517208099365, 'learning_rate': 4.611126073354571e-06, 'epoch': 10.9, 'num_input_tokens_seen': 15938720} +{'loss': 0.8318, 'grad_norm': 1.442195177078247, 'learning_rate': 4.3939941913863525e-06, 'epoch': 10.94, 'num_input_tokens_seen': 15992720} +{'loss': 0.7655, 'grad_norm': 1.4696769714355469, 'learning_rate': 4.181863521888019e-06, 'epoch': 10.97, 'num_input_tokens_seen': 16049584} +{'loss': 0.8017, 'grad_norm': 1.1691123247146606, 'learning_rate': 3.974757327377981e-06, 'epoch': 11.01, 'num_input_tokens_seen': 16103840} +{'loss': 0.8806, 'grad_norm': 1.1555007696151733, 'learning_rate': 3.772698319384349e-06, 'epoch': 11.05, 'num_input_tokens_seen': 16160880} +{'loss': 0.7852, 'grad_norm': 1.264450192451477, 'learning_rate': 3.575708655954324e-06, 'epoch': 11.09, 'num_input_tokens_seen': 16213280} +{'loss': 0.7314, 'grad_norm': 1.1983911991119385, 'learning_rate': 3.3838099392243916e-06, 'epoch': 11.12, 'num_input_tokens_seen': 16265488} +{'loss': 0.809, 'grad_norm': 1.366458773612976, 'learning_rate': 3.197023213051337e-06, 'epoch': 11.16, 'num_input_tokens_seen': 16321040} +{'loss': 0.7076, 'grad_norm': 1.4491347074508667, 'learning_rate': 3.0153689607045845e-06, 'epoch': 11.2, 'num_input_tokens_seen': 16372560} +{'loss': 0.8021, 'grad_norm': 1.481589674949646, 'learning_rate': 2.8388671026199522e-06, 'epoch': 11.23, 'num_input_tokens_seen': 16425856} +{'loss': 0.7633, 'grad_norm': 1.3592849969863892, 'learning_rate': 2.667536994215186e-06, 'epoch': 11.27, 'num_input_tokens_seen': 16480176} +{'loss': 0.7514, 'grad_norm': 1.3644659519195557, 'learning_rate': 2.501397423767382e-06, 'epoch': 11.31, 'num_input_tokens_seen': 16531248} +{'loss': 0.9144, 'grad_norm': 1.3623534440994263, 'learning_rate': 2.340466610352654e-06, 'epoch': 11.35, 'num_input_tokens_seen': 16589952} +{'loss': 0.7871, 'grad_norm': 1.3221518993377686, 'learning_rate': 2.1847622018482283e-06, 'epoch': 11.38, 'num_input_tokens_seen': 16642688} +{'loss': 0.8018, 'grad_norm': 1.3355658054351807, 'learning_rate': 2.0343012729971243e-06, 'epoch': 11.42, 'num_input_tokens_seen': 16698016} +{'loss': 0.7833, 'grad_norm': 1.3741073608398438, 'learning_rate': 1.8891003235357308e-06, 'epoch': 11.46, 'num_input_tokens_seen': 16751600} +{'loss': 0.8172, 'grad_norm': 1.2644001245498657, 'learning_rate': 1.7491752763844293e-06, 'epoch': 11.5, 'num_input_tokens_seen': 16807424} +{'loss': 0.7801, 'grad_norm': 1.3740483522415161, 'learning_rate': 1.6145414759014431e-06, 'epoch': 11.53, 'num_input_tokens_seen': 16863040} +{'loss': 0.8497, 'grad_norm': 1.4756935834884644, 'learning_rate': 1.4852136862001764e-06, 'epoch': 11.57, 'num_input_tokens_seen': 16919056} +{'loss': 0.865, 'grad_norm': 1.253000020980835, 'learning_rate': 1.3612060895301759e-06, 'epoch': 11.61, 'num_input_tokens_seen': 16976880} +{'loss': 0.6891, 'grad_norm': 1.3113460540771484, 'learning_rate': 1.2425322847218368e-06, 'epoch': 11.65, 'num_input_tokens_seen': 17028016} +{'loss': 0.8017, 'grad_norm': 1.2623776197433472, 'learning_rate': 1.1292052856952062e-06, 'epoch': 11.68, 'num_input_tokens_seen': 17084832} +{'loss': 0.8494, 'grad_norm': 1.3193752765655518, 'learning_rate': 1.0212375200327973e-06, 'epoch': 11.72, 'num_input_tokens_seen': 17141568} +{'loss': 0.7922, 'grad_norm': 1.3761273622512817, 'learning_rate': 9.186408276168013e-07, 'epoch': 11.76, 'num_input_tokens_seen': 17197456} +{'loss': 0.7013, 'grad_norm': 1.3351306915283203, 'learning_rate': 8.214264593307098e-07, 'epoch': 11.79, 'num_input_tokens_seen': 17246704} +{'loss': 0.788, 'grad_norm': 1.3546366691589355, 'learning_rate': 7.296050758254957e-07, 'epoch': 11.83, 'num_input_tokens_seen': 17301040} +{'loss': 0.7436, 'grad_norm': 1.461629033088684, 'learning_rate': 6.431867463506048e-07, 'epoch': 11.87, 'num_input_tokens_seen': 17353760} +{'loss': 0.783, 'grad_norm': 1.3888185024261475, 'learning_rate': 5.621809476497098e-07, 'epoch': 11.91, 'num_input_tokens_seen': 17408864} +{'loss': 0.8306, 'grad_norm': 1.4601279497146606, 'learning_rate': 4.865965629214819e-07, 'epoch': 11.94, 'num_input_tokens_seen': 17463616} +{'loss': 0.9427, 'grad_norm': 1.427171230316162, 'learning_rate': 4.1644188084548063e-07, 'epoch': 11.98, 'num_input_tokens_seen': 17521440} +{'loss': 0.8395, 'grad_norm': 1.1976938247680664, 'learning_rate': 3.517245946731529e-07, 'epoch': 12.02, 'num_input_tokens_seen': 17578672} +{'loss': 0.8069, 'grad_norm': 1.3307095766067505, 'learning_rate': 2.924518013842303e-07, 'epoch': 12.06, 'num_input_tokens_seen': 17635856} +{'loss': 0.7471, 'grad_norm': 1.3354995250701904, 'learning_rate': 2.386300009084408e-07, 'epoch': 12.09, 'num_input_tokens_seen': 17689072} +{'loss': 0.8251, 'grad_norm': 1.1849631071090698, 'learning_rate': 1.9026509541272275e-07, 'epoch': 12.13, 'num_input_tokens_seen': 17746800} +{'loss': 0.7807, 'grad_norm': 1.19794762134552, 'learning_rate': 1.4736238865398765e-07, 'epoch': 12.17, 'num_input_tokens_seen': 17799264} +{'loss': 0.8495, 'grad_norm': 1.3394030332565308, 'learning_rate': 1.0992658539750178e-07, 'epoch': 12.21, 'num_input_tokens_seen': 17857296} +{'loss': 0.7786, 'grad_norm': 1.3379496335983276, 'learning_rate': 7.796179090094891e-08, 'epoch': 12.24, 'num_input_tokens_seen': 17911456} +{'loss': 0.7214, 'grad_norm': 1.3171231746673584, 'learning_rate': 5.1471510464268236e-08, 'epoch': 12.28, 'num_input_tokens_seen': 17960800} +{'loss': 0.7353, 'grad_norm': 1.3726296424865723, 'learning_rate': 3.04586490452119e-08, 'epoch': 12.32, 'num_input_tokens_seen': 18014816} +{'loss': 0.7827, 'grad_norm': 1.3374559879302979, 'learning_rate': 1.4925510940844156e-08, 'epoch': 12.35, 'num_input_tokens_seen': 18069152} +{'loss': 0.871, 'grad_norm': 1.2918580770492554, 'learning_rate': 4.873799534788059e-09, 'epoch': 12.39, 'num_input_tokens_seen': 18124176} +{'loss': 0.8343, 'grad_norm': 1.5582739114761353, 'learning_rate': 3.0461711048035415e-10, 'epoch': 12.43, 'num_input_tokens_seen': 18177392} +{'eval_loss': 1.8893193006515503, 'eval_runtime': 10.0832, 'eval_samples_per_second': 99.175, 'eval_steps_per_second': 7.141, 'epoch': 12.44, 'num_input_tokens_seen': 18194112} +{'train_runtime': 3246.117, 'train_samples_per_second': 34.503, 'train_steps_per_second': 0.308, 'train_loss': 1.1796256858706475, 'epoch': 12.44, 'num_input_tokens_seen': 18194112} +***** train metrics ***** + epoch = 12.4417 + num_input_tokens_seen = 18194112 + total_flos = 609198826GF + train_loss = 1.1796 + train_runtime = 0:54:06.11 + train_samples_per_second = 34.503 + train_steps_per_second = 0.308 +Figure saved at: ./results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/training_loss.png +Figure saved at: ./results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/training_eval_loss.png +08/20/2024 09:30:00 - WARNING - llamafactory.extras.ploting - No metric eval_accuracy to plot. +***** eval metrics ***** + epoch = 12.4417 + eval_loss = 1.8893 + eval_runtime = 0:00:10.01 + eval_samples_per_second = 99.899 + eval_steps_per_second = 7.193 + num_input_tokens_seen = 18194112 diff --git a/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/README.md b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/README.md new file mode 100644 index 00000000..01875388 --- /dev/null +++ b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/README.md @@ -0,0 +1,69 @@ +--- +base_model: ../../llm/chatglm/data +library_name: peft +license: other +tags: +- llama-factory +- lora +- generated_from_trainer +model-index: +- name: ChatGLM2_lora_sft_1 + results: [] +--- + + + +# ChatGLM2_lora_sft_1 + +This model is a fine-tuned version of [../../llm/chatglm/data](https://huggingface.co/../../llm/chatglm/data) on the belle_1m dataset. +It achieves the following results on the evaluation set: +- Loss: 1.8893 +- Num Input Tokens Seen: 18194112 + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 0.0001 +- train_batch_size: 2 +- eval_batch_size: 2 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 7 +- gradient_accumulation_steps: 8 +- total_train_batch_size: 112 +- total_eval_batch_size: 14 +- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08 +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- training_steps: 1000 + +### Training results + +| Training Loss | Epoch | Step | Validation Loss | Input Tokens Seen | +|:-------------:|:-------:|:----:|:---------------:|:-----------------:| +| 1.0215 | 6.2208 | 500 | 1.6312 | 9110112 | +| 0.8343 | 12.4417 | 1000 | 1.8893 | 18194112 | + + +### Framework versions + +- PEFT 0.12.0 +- Transformers 4.43.4 +- Pytorch 2.4.0+cu121 +- Datasets 2.20.0 +- Tokenizers 0.19.1 \ No newline at end of file diff --git a/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/adapter_config.json b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/adapter_config.json new file mode 100644 index 00000000..da517bb4 --- /dev/null +++ b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/adapter_config.json @@ -0,0 +1,31 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "../../llm/chatglm/data", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "dense_h_to_4h", + "query_key_value", + "dense", + "dense_4h_to_h" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/adapter_model.safetensors b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/adapter_model.safetensors new file mode 100644 index 00000000..ac2ad755 Binary files /dev/null and b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/adapter_model.safetensors differ diff --git a/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/all_results.json b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/all_results.json new file mode 100644 index 00000000..eceb52b3 --- /dev/null +++ b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/all_results.json @@ -0,0 +1,13 @@ +{ + "epoch": 12.441679626749611, + "eval_loss": 1.8893193006515503, + "eval_runtime": 10.0101, + "eval_samples_per_second": 99.899, + "eval_steps_per_second": 7.193, + "num_input_tokens_seen": 18194112, + "total_flos": 6.541222595474227e+17, + "train_loss": 1.1796256858706475, + "train_runtime": 3246.117, + "train_samples_per_second": 34.503, + "train_steps_per_second": 0.308 +} \ No newline at end of file diff --git a/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/eval_results.json b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/eval_results.json new file mode 100644 index 00000000..cf800b25 --- /dev/null +++ b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/eval_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 12.441679626749611, + "eval_loss": 1.8893193006515503, + "eval_runtime": 10.0101, + "eval_samples_per_second": 99.899, + "eval_steps_per_second": 7.193, + "num_input_tokens_seen": 18194112 +} \ No newline at end of file diff --git a/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/special_tokens_map.json b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/special_tokens_map.json new file mode 100644 index 00000000..0967ef42 --- /dev/null +++ b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/special_tokens_map.json @@ -0,0 +1 @@ +{} diff --git a/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/tokenization_chatglm.py b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/tokenization_chatglm.py new file mode 100644 index 00000000..056d436b --- /dev/null +++ b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/tokenization_chatglm.py @@ -0,0 +1,250 @@ +import os +import torch +from typing import List, Optional, Union, Dict +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.utils import logging, PaddingStrategy +from transformers.tokenization_utils_base import EncodedInput, BatchEncoding + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + return self.sp_model.decode(t) + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens or index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, vocab_file, padding_side="left", clean_up_tokenization_spaces=False, **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__(padding_side=padding_side, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + Returns: + `Tuple(str)`: Paths to the files saved. + """ + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, self.vocab_files_names["vocab_file"] + ) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_prompt(self, query, history=None): + if history is None: + history = [] + prompt = "" + for i, (old_query, response) in enumerate(history): + prompt += "[Round {}]\n\n问:{}\n\n答:{}\n\n".format(i + 1, old_query, response) + prompt += "[Round {}]\n\n问:{}\n\n答:".format(len(history) + 1, query) + return prompt + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs + diff --git a/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/tokenizer.model b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/tokenizer.model new file mode 100644 index 00000000..c8336ad0 Binary files /dev/null and b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/tokenizer.model differ diff --git a/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/tokenizer_config.json b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/tokenizer_config.json new file mode 100644 index 00000000..652904d3 --- /dev/null +++ b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/tokenizer_config.json @@ -0,0 +1,19 @@ +{ + "added_tokens_decoder": {}, + "auto_map": { + "AutoTokenizer": [ + "tokenization_chatglm.ChatGLMTokenizer", + null + ] + }, + "clean_up_tokenization_spaces": false, + "do_lower_case": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "remove_space": false, + "split_special_tokens": false, + "tokenizer_class": "ChatGLMTokenizer", + "unk_token": "" +} diff --git a/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/train_results.json b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/train_results.json new file mode 100644 index 00000000..7af9bfda --- /dev/null +++ b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/train_results.json @@ -0,0 +1,9 @@ +{ + "epoch": 12.441679626749611, + "num_input_tokens_seen": 18194112, + "total_flos": 6.541222595474227e+17, + "train_loss": 1.1796256858706475, + "train_runtime": 3246.117, + "train_samples_per_second": 34.503, + "train_steps_per_second": 0.308 +} \ No newline at end of file diff --git a/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/trainer_log.jsonl b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/trainer_log.jsonl new file mode 100644 index 00000000..410cc9f8 --- /dev/null +++ b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/trainer_log.jsonl @@ -0,0 +1,336 @@ +{"current_steps": 3, "total_steps": 1000, "loss": 2.6491, "learning_rate": 3e-06, "epoch": 0.03732503888024884, "percentage": 0.3, "cur_time": "2024-08-20 08:36:06", "elapsed_time": "0:00:11", "remaining_time": "1:06:27", "throughput": "4946.95", "total_tokens": 59360} +{"current_steps": 6, "total_steps": 1000, "loss": 2.714, "learning_rate": 6e-06, "epoch": 0.07465007776049767, "percentage": 0.6, "cur_time": "2024-08-20 08:36:15", "elapsed_time": "0:00:21", "remaining_time": "0:58:02", "throughput": "5243.60", "total_tokens": 110224} +{"current_steps": 9, "total_steps": 1000, "loss": 2.8832, "learning_rate": 9e-06, "epoch": 0.1119751166407465, "percentage": 0.9, "cur_time": "2024-08-20 08:36:24", "elapsed_time": "0:00:29", "remaining_time": "0:55:00", "throughput": "5419.06", "total_tokens": 162416} +{"current_steps": 12, "total_steps": 1000, "loss": 2.8293, "learning_rate": 1.2e-05, "epoch": 0.14930015552099535, "percentage": 1.2, "cur_time": "2024-08-20 08:36:33", "elapsed_time": "0:00:39", "remaining_time": "0:53:46", "throughput": "5502.14", "total_tokens": 215648} +{"current_steps": 15, "total_steps": 1000, "loss": 2.7404, "learning_rate": 1.5e-05, "epoch": 0.18662519440124417, "percentage": 1.5, "cur_time": "2024-08-20 08:36:43", "elapsed_time": "0:00:48", "remaining_time": "0:53:06", "throughput": "5574.39", "total_tokens": 270528} +{"current_steps": 18, "total_steps": 1000, "loss": 2.7317, "learning_rate": 1.8e-05, "epoch": 0.223950233281493, "percentage": 1.8, "cur_time": "2024-08-20 08:36:51", "elapsed_time": "0:00:57", "remaining_time": "0:52:06", "throughput": "5600.24", "total_tokens": 320944} +{"current_steps": 21, "total_steps": 1000, "loss": 2.7431, "learning_rate": 2.1e-05, "epoch": 0.26127527216174184, "percentage": 2.1, "cur_time": "2024-08-20 08:37:01", "elapsed_time": "0:01:07", "remaining_time": "0:52:18", "throughput": "5611.03", "total_tokens": 377696} +{"current_steps": 24, "total_steps": 1000, "loss": 2.5594, "learning_rate": 2.4e-05, "epoch": 0.2986003110419907, "percentage": 2.4, "cur_time": "2024-08-20 08:37:10", "elapsed_time": "0:01:15", "remaining_time": "0:51:21", "throughput": "5649.05", "total_tokens": 428048} +{"current_steps": 27, "total_steps": 1000, "loss": 2.7774, "learning_rate": 2.7000000000000002e-05, "epoch": 0.3359253499222395, "percentage": 2.7, "cur_time": "2024-08-20 08:37:20", "elapsed_time": "0:01:25", "remaining_time": "0:51:23", "throughput": "5675.71", "total_tokens": 485680} +{"current_steps": 30, "total_steps": 1000, "loss": 2.7325, "learning_rate": 3e-05, "epoch": 0.37325038880248834, "percentage": 3.0, "cur_time": "2024-08-20 08:37:29", "elapsed_time": "0:01:35", "remaining_time": "0:51:16", "throughput": "5662.17", "total_tokens": 538704} +{"current_steps": 33, "total_steps": 1000, "loss": 2.6304, "learning_rate": 3.3e-05, "epoch": 0.4105754276827372, "percentage": 3.3, "cur_time": "2024-08-20 08:37:39", "elapsed_time": "0:01:45", "remaining_time": "0:51:22", "throughput": "5609.36", "total_tokens": 590112} +{"current_steps": 36, "total_steps": 1000, "loss": 2.6643, "learning_rate": 3.6e-05, "epoch": 0.447900466562986, "percentage": 3.6, "cur_time": "2024-08-20 08:37:48", "elapsed_time": "0:01:54", "remaining_time": "0:51:00", "throughput": "5646.52", "total_tokens": 645360} +{"current_steps": 39, "total_steps": 1000, "loss": 2.5223, "learning_rate": 3.9000000000000006e-05, "epoch": 0.48522550544323484, "percentage": 3.9, "cur_time": "2024-08-20 08:37:58", "elapsed_time": "0:02:04", "remaining_time": "0:50:55", "throughput": "5642.02", "total_tokens": 699664} +{"current_steps": 42, "total_steps": 1000, "loss": 2.3975, "learning_rate": 4.2e-05, "epoch": 0.5225505443234837, "percentage": 4.2, "cur_time": "2024-08-20 08:38:08", "elapsed_time": "0:02:13", "remaining_time": "0:50:50", "throughput": "5628.15", "total_tokens": 752640} +{"current_steps": 45, "total_steps": 1000, "loss": 2.3851, "learning_rate": 4.5e-05, "epoch": 0.5598755832037325, "percentage": 4.5, "cur_time": "2024-08-20 08:38:18", "elapsed_time": "0:02:24", "remaining_time": "0:50:57", "throughput": "5618.36", "total_tokens": 809488} +{"current_steps": 48, "total_steps": 1000, "loss": 2.1509, "learning_rate": 4.8e-05, "epoch": 0.5972006220839814, "percentage": 4.8, "cur_time": "2024-08-20 08:38:29", "elapsed_time": "0:02:34", "remaining_time": "0:51:02", "throughput": "5607.81", "total_tokens": 866016} +{"current_steps": 51, "total_steps": 1000, "loss": 1.9498, "learning_rate": 5.1000000000000006e-05, "epoch": 0.6345256609642301, "percentage": 5.1, "cur_time": "2024-08-20 08:38:39", "elapsed_time": "0:02:44", "remaining_time": "0:51:07", "throughput": "5593.55", "total_tokens": 922160} +{"current_steps": 54, "total_steps": 1000, "loss": 1.863, "learning_rate": 5.4000000000000005e-05, "epoch": 0.671850699844479, "percentage": 5.4, "cur_time": "2024-08-20 08:38:49", "elapsed_time": "0:02:54", "remaining_time": "0:50:54", "throughput": "5599.15", "total_tokens": 976400} +{"current_steps": 57, "total_steps": 1000, "loss": 1.6358, "learning_rate": 5.6999999999999996e-05, "epoch": 0.7091757387247278, "percentage": 5.7, "cur_time": "2024-08-20 08:38:58", "elapsed_time": "0:03:03", "remaining_time": "0:50:41", "throughput": "5596.43", "total_tokens": 1028864} +{"current_steps": 60, "total_steps": 1000, "loss": 1.6125, "learning_rate": 6e-05, "epoch": 0.7465007776049767, "percentage": 6.0, "cur_time": "2024-08-20 08:39:08", "elapsed_time": "0:03:13", "remaining_time": "0:50:35", "throughput": "5589.05", "total_tokens": 1082896} +{"current_steps": 63, "total_steps": 1000, "loss": 1.7412, "learning_rate": 6.3e-05, "epoch": 0.7838258164852255, "percentage": 6.3, "cur_time": "2024-08-20 08:39:18", "elapsed_time": "0:03:23", "remaining_time": "0:50:28", "throughput": "5608.43", "total_tokens": 1141856} +{"current_steps": 66, "total_steps": 1000, "loss": 1.6413, "learning_rate": 6.6e-05, "epoch": 0.8211508553654744, "percentage": 6.6, "cur_time": "2024-08-20 08:39:28", "elapsed_time": "0:03:33", "remaining_time": "0:50:23", "throughput": "5602.40", "total_tokens": 1196976} +{"current_steps": 69, "total_steps": 1000, "loss": 1.6965, "learning_rate": 6.9e-05, "epoch": 0.8584758942457231, "percentage": 6.9, "cur_time": "2024-08-20 08:39:38", "elapsed_time": "0:03:43", "remaining_time": "0:50:19", "throughput": "5583.44", "total_tokens": 1249696} +{"current_steps": 72, "total_steps": 1000, "loss": 1.623, "learning_rate": 7.2e-05, "epoch": 0.895800933125972, "percentage": 7.2, "cur_time": "2024-08-20 08:39:48", "elapsed_time": "0:03:53", "remaining_time": "0:50:12", "throughput": "5582.67", "total_tokens": 1304880} +{"current_steps": 75, "total_steps": 1000, "loss": 1.5551, "learning_rate": 7.500000000000001e-05, "epoch": 0.9331259720062208, "percentage": 7.5, "cur_time": "2024-08-20 08:39:58", "elapsed_time": "0:04:03", "remaining_time": "0:50:02", "throughput": "5585.72", "total_tokens": 1359776} +{"current_steps": 78, "total_steps": 1000, "loss": 1.5815, "learning_rate": 7.800000000000001e-05, "epoch": 0.9704510108864697, "percentage": 7.8, "cur_time": "2024-08-20 08:40:08", "elapsed_time": "0:04:13", "remaining_time": "0:50:01", "throughput": "5575.52", "total_tokens": 1415584} +{"current_steps": 81, "total_steps": 1000, "loss": 1.5404, "learning_rate": 8.1e-05, "epoch": 1.0077760497667185, "percentage": 8.1, "cur_time": "2024-08-20 08:40:17", "elapsed_time": "0:04:23", "remaining_time": "0:49:47", "throughput": "5577.85", "total_tokens": 1468720} +{"current_steps": 84, "total_steps": 1000, "loss": 1.5862, "learning_rate": 8.4e-05, "epoch": 1.0451010886469674, "percentage": 8.4, "cur_time": "2024-08-20 08:40:27", "elapsed_time": "0:04:33", "remaining_time": "0:49:39", "throughput": "5585.20", "total_tokens": 1525872} +{"current_steps": 87, "total_steps": 1000, "loss": 1.6087, "learning_rate": 8.7e-05, "epoch": 1.0824261275272162, "percentage": 8.7, "cur_time": "2024-08-20 08:40:36", "elapsed_time": "0:04:42", "remaining_time": "0:49:21", "throughput": "5589.41", "total_tokens": 1577456} +{"current_steps": 90, "total_steps": 1000, "loss": 1.5819, "learning_rate": 9e-05, "epoch": 1.119751166407465, "percentage": 9.0, "cur_time": "2024-08-20 08:40:46", "elapsed_time": "0:04:51", "remaining_time": "0:49:06", "throughput": "5597.27", "total_tokens": 1631152} +{"current_steps": 93, "total_steps": 1000, "loss": 1.6023, "learning_rate": 9.300000000000001e-05, "epoch": 1.157076205287714, "percentage": 9.3, "cur_time": "2024-08-20 08:40:57", "elapsed_time": "0:05:02", "remaining_time": "0:49:09", "throughput": "5594.67", "total_tokens": 1692128} +{"current_steps": 96, "total_steps": 1000, "loss": 1.5937, "learning_rate": 9.6e-05, "epoch": 1.1944012441679628, "percentage": 9.6, "cur_time": "2024-08-20 08:41:07", "elapsed_time": "0:05:12", "remaining_time": "0:49:04", "throughput": "5577.97", "total_tokens": 1744368} +{"current_steps": 99, "total_steps": 1000, "loss": 1.588, "learning_rate": 9.900000000000001e-05, "epoch": 1.2317262830482114, "percentage": 9.9, "cur_time": "2024-08-20 08:41:16", "elapsed_time": "0:05:21", "remaining_time": "0:48:48", "throughput": "5580.32", "total_tokens": 1795904} +{"current_steps": 102, "total_steps": 1000, "loss": 1.5538, "learning_rate": 9.999878153526974e-05, "epoch": 1.2690513219284603, "percentage": 10.2, "cur_time": "2024-08-20 08:41:26", "elapsed_time": "0:05:31", "remaining_time": "0:48:42", "throughput": "5579.08", "total_tokens": 1851744} +{"current_steps": 105, "total_steps": 1000, "loss": 1.5003, "learning_rate": 9.999238475781957e-05, "epoch": 1.3063763608087091, "percentage": 10.5, "cur_time": "2024-08-20 08:41:36", "elapsed_time": "0:05:41", "remaining_time": "0:48:34", "throughput": "5570.98", "total_tokens": 1904912} +{"current_steps": 108, "total_steps": 1000, "loss": 1.5044, "learning_rate": 9.998050575201771e-05, "epoch": 1.343701399688958, "percentage": 10.8, "cur_time": "2024-08-20 08:41:45", "elapsed_time": "0:05:51", "remaining_time": "0:48:20", "throughput": "5571.17", "total_tokens": 1956592} +{"current_steps": 111, "total_steps": 1000, "loss": 1.5709, "learning_rate": 9.996314582053106e-05, "epoch": 1.3810264385692068, "percentage": 11.1, "cur_time": "2024-08-20 08:41:54", "elapsed_time": "0:06:00", "remaining_time": "0:48:04", "throughput": "5576.30", "total_tokens": 2008672} +{"current_steps": 114, "total_steps": 1000, "loss": 1.5742, "learning_rate": 9.99403068670717e-05, "epoch": 1.4183514774494557, "percentage": 11.4, "cur_time": "2024-08-20 08:42:04", "elapsed_time": "0:06:09", "remaining_time": "0:47:51", "throughput": "5584.08", "total_tokens": 2063056} +{"current_steps": 117, "total_steps": 1000, "loss": 1.4958, "learning_rate": 9.991199139618827e-05, "epoch": 1.4556765163297045, "percentage": 11.7, "cur_time": "2024-08-20 08:42:14", "elapsed_time": "0:06:20", "remaining_time": "0:47:49", "throughput": "5574.98", "total_tokens": 2120000} +{"current_steps": 120, "total_steps": 1000, "loss": 1.5715, "learning_rate": 9.987820251299122e-05, "epoch": 1.4930015552099534, "percentage": 12.0, "cur_time": "2024-08-20 08:42:24", "elapsed_time": "0:06:29", "remaining_time": "0:47:38", "throughput": "5576.97", "total_tokens": 2173760} +{"current_steps": 123, "total_steps": 1000, "loss": 1.5582, "learning_rate": 9.983894392281237e-05, "epoch": 1.5303265940902022, "percentage": 12.3, "cur_time": "2024-08-20 08:42:33", "elapsed_time": "0:06:39", "remaining_time": "0:47:25", "throughput": "5584.57", "total_tokens": 2228944} +{"current_steps": 126, "total_steps": 1000, "loss": 1.4904, "learning_rate": 9.979421993079852e-05, "epoch": 1.5676516329704508, "percentage": 12.6, "cur_time": "2024-08-20 08:42:43", "elapsed_time": "0:06:48", "remaining_time": "0:47:13", "throughput": "5582.50", "total_tokens": 2280544} +{"current_steps": 129, "total_steps": 1000, "loss": 1.4941, "learning_rate": 9.974403544143941e-05, "epoch": 1.6049766718506997, "percentage": 12.9, "cur_time": "2024-08-20 08:42:53", "elapsed_time": "0:06:58", "remaining_time": "0:47:08", "throughput": "5579.11", "total_tokens": 2337536} +{"current_steps": 132, "total_steps": 1000, "loss": 1.5594, "learning_rate": 9.968839595802982e-05, "epoch": 1.6423017107309485, "percentage": 13.2, "cur_time": "2024-08-20 08:43:02", "elapsed_time": "0:07:08", "remaining_time": "0:46:54", "throughput": "5586.46", "total_tokens": 2391008} +{"current_steps": 135, "total_steps": 1000, "loss": 1.45, "learning_rate": 9.962730758206611e-05, "epoch": 1.6796267496111974, "percentage": 13.5, "cur_time": "2024-08-20 08:43:12", "elapsed_time": "0:07:18", "remaining_time": "0:46:46", "throughput": "5581.98", "total_tokens": 2445312} +{"current_steps": 138, "total_steps": 1000, "loss": 1.6545, "learning_rate": 9.956077701257709e-05, "epoch": 1.7169517884914463, "percentage": 13.8, "cur_time": "2024-08-20 08:43:21", "elapsed_time": "0:07:26", "remaining_time": "0:46:32", "throughput": "5583.17", "total_tokens": 2495616} +{"current_steps": 141, "total_steps": 1000, "loss": 1.5083, "learning_rate": 9.948881154538945e-05, "epoch": 1.754276827371695, "percentage": 14.1, "cur_time": "2024-08-20 08:43:31", "elapsed_time": "0:07:36", "remaining_time": "0:46:22", "throughput": "5589.86", "total_tokens": 2553136} +{"current_steps": 144, "total_steps": 1000, "loss": 1.511, "learning_rate": 9.941141907232765e-05, "epoch": 1.791601866251944, "percentage": 14.4, "cur_time": "2024-08-20 08:43:40", "elapsed_time": "0:07:45", "remaining_time": "0:46:09", "throughput": "5596.94", "total_tokens": 2607888} +{"current_steps": 147, "total_steps": 1000, "loss": 1.4864, "learning_rate": 9.932860808034848e-05, "epoch": 1.8289269051321928, "percentage": 14.7, "cur_time": "2024-08-20 08:43:50", "elapsed_time": "0:07:55", "remaining_time": "0:45:59", "throughput": "5599.55", "total_tokens": 2662928} +{"current_steps": 150, "total_steps": 1000, "loss": 1.4787, "learning_rate": 9.924038765061042e-05, "epoch": 1.8662519440124417, "percentage": 15.0, "cur_time": "2024-08-20 08:44:00", "elapsed_time": "0:08:06", "remaining_time": "0:45:54", "throughput": "5595.50", "total_tokens": 2719936} +{"current_steps": 153, "total_steps": 1000, "loss": 1.4957, "learning_rate": 9.914676745747772e-05, "epoch": 1.9035769828926905, "percentage": 15.3, "cur_time": "2024-08-20 08:44:10", "elapsed_time": "0:08:15", "remaining_time": "0:45:45", "throughput": "5596.03", "total_tokens": 2774928} +{"current_steps": 156, "total_steps": 1000, "loss": 1.5388, "learning_rate": 9.904775776745958e-05, "epoch": 1.9409020217729394, "percentage": 15.6, "cur_time": "2024-08-20 08:44:20", "elapsed_time": "0:08:25", "remaining_time": "0:45:35", "throughput": "5599.31", "total_tokens": 2831584} +{"current_steps": 159, "total_steps": 1000, "loss": 1.4156, "learning_rate": 9.894336943808426e-05, "epoch": 1.9782270606531882, "percentage": 15.9, "cur_time": "2024-08-20 08:44:30", "elapsed_time": "0:08:35", "remaining_time": "0:45:27", "throughput": "5596.92", "total_tokens": 2886528} +{"current_steps": 162, "total_steps": 1000, "loss": 1.4933, "learning_rate": 9.88336139167084e-05, "epoch": 2.015552099533437, "percentage": 16.2, "cur_time": "2024-08-20 08:44:40", "elapsed_time": "0:08:46", "remaining_time": "0:45:21", "throughput": "5595.60", "total_tokens": 2944048} +{"current_steps": 165, "total_steps": 1000, "loss": 1.4981, "learning_rate": 9.871850323926177e-05, "epoch": 2.052877138413686, "percentage": 16.5, "cur_time": "2024-08-20 08:44:49", "elapsed_time": "0:08:55", "remaining_time": "0:45:08", "throughput": "5598.76", "total_tokens": 2996624} +{"current_steps": 168, "total_steps": 1000, "loss": 1.5245, "learning_rate": 9.859805002892732e-05, "epoch": 2.0902021772939348, "percentage": 16.8, "cur_time": "2024-08-20 08:44:59", "elapsed_time": "0:09:05", "remaining_time": "0:44:59", "throughput": "5597.53", "total_tokens": 3050672} +{"current_steps": 171, "total_steps": 1000, "loss": 1.4546, "learning_rate": 9.847226749475695e-05, "epoch": 2.1275272161741836, "percentage": 17.1, "cur_time": "2024-08-20 08:45:08", "elapsed_time": "0:09:13", "remaining_time": "0:44:45", "throughput": "5600.08", "total_tokens": 3101648} +{"current_steps": 174, "total_steps": 1000, "loss": 1.4426, "learning_rate": 9.834116943022298e-05, "epoch": 2.1648522550544325, "percentage": 17.4, "cur_time": "2024-08-20 08:45:18", "elapsed_time": "0:09:23", "remaining_time": "0:44:37", "throughput": "5601.64", "total_tokens": 3158912} +{"current_steps": 177, "total_steps": 1000, "loss": 1.4767, "learning_rate": 9.820477021170551e-05, "epoch": 2.2021772939346813, "percentage": 17.7, "cur_time": "2024-08-20 08:45:28", "elapsed_time": "0:09:34", "remaining_time": "0:44:29", "throughput": "5599.77", "total_tokens": 3215280} +{"current_steps": 180, "total_steps": 1000, "loss": 1.4652, "learning_rate": 9.806308479691595e-05, "epoch": 2.23950233281493, "percentage": 18.0, "cur_time": "2024-08-20 08:45:38", "elapsed_time": "0:09:44", "remaining_time": "0:44:20", "throughput": "5598.91", "total_tokens": 3270144} +{"current_steps": 183, "total_steps": 1000, "loss": 1.451, "learning_rate": 9.791612872325667e-05, "epoch": 2.276827371695179, "percentage": 18.3, "cur_time": "2024-08-20 08:45:48", "elapsed_time": "0:09:54", "remaining_time": "0:44:12", "throughput": "5598.33", "total_tokens": 3326128} +{"current_steps": 186, "total_steps": 1000, "loss": 1.4005, "learning_rate": 9.776391810611718e-05, "epoch": 2.314152410575428, "percentage": 18.6, "cur_time": "2024-08-20 08:45:59", "elapsed_time": "0:10:04", "remaining_time": "0:44:05", "throughput": "5595.52", "total_tokens": 3382096} +{"current_steps": 189, "total_steps": 1000, "loss": 1.5281, "learning_rate": 9.760646963710694e-05, "epoch": 2.3514774494556763, "percentage": 18.9, "cur_time": "2024-08-20 08:46:07", "elapsed_time": "0:10:13", "remaining_time": "0:43:51", "throughput": "5597.45", "total_tokens": 3432624} +{"current_steps": 192, "total_steps": 1000, "loss": 1.4811, "learning_rate": 9.744380058222483e-05, "epoch": 2.3888024883359256, "percentage": 19.2, "cur_time": "2024-08-20 08:46:17", "elapsed_time": "0:10:22", "remaining_time": "0:43:40", "throughput": "5596.84", "total_tokens": 3485216} +{"current_steps": 195, "total_steps": 1000, "loss": 1.4439, "learning_rate": 9.727592877996585e-05, "epoch": 2.426127527216174, "percentage": 19.5, "cur_time": "2024-08-20 08:46:26", "elapsed_time": "0:10:32", "remaining_time": "0:43:29", "throughput": "5594.71", "total_tokens": 3536224} +{"current_steps": 198, "total_steps": 1000, "loss": 1.4742, "learning_rate": 9.710287263936484e-05, "epoch": 2.463452566096423, "percentage": 19.8, "cur_time": "2024-08-20 08:46:35", "elapsed_time": "0:10:41", "remaining_time": "0:43:17", "throughput": "5595.92", "total_tokens": 3588128} +{"current_steps": 201, "total_steps": 1000, "loss": 1.4419, "learning_rate": 9.69246511379778e-05, "epoch": 2.5007776049766717, "percentage": 20.1, "cur_time": "2024-08-20 08:46:46", "elapsed_time": "0:10:51", "remaining_time": "0:43:11", "throughput": "5589.82", "total_tokens": 3644256} +{"current_steps": 204, "total_steps": 1000, "loss": 1.4283, "learning_rate": 9.674128381980072e-05, "epoch": 2.5381026438569205, "percentage": 20.4, "cur_time": "2024-08-20 08:46:55", "elapsed_time": "0:11:01", "remaining_time": "0:43:00", "throughput": "5592.11", "total_tokens": 3697680} +{"current_steps": 207, "total_steps": 1000, "loss": 1.4107, "learning_rate": 9.655279079312642e-05, "epoch": 2.5754276827371694, "percentage": 20.7, "cur_time": "2024-08-20 08:47:06", "elapsed_time": "0:11:11", "remaining_time": "0:42:53", "throughput": "5591.38", "total_tokens": 3756608} +{"current_steps": 210, "total_steps": 1000, "loss": 1.4162, "learning_rate": 9.635919272833938e-05, "epoch": 2.6127527216174182, "percentage": 21.0, "cur_time": "2024-08-20 08:47:15", "elapsed_time": "0:11:21", "remaining_time": "0:42:42", "throughput": "5595.06", "total_tokens": 3811632} +{"current_steps": 213, "total_steps": 1000, "loss": 1.5205, "learning_rate": 9.616051085564906e-05, "epoch": 2.650077760497667, "percentage": 21.3, "cur_time": "2024-08-20 08:47:26", "elapsed_time": "0:11:31", "remaining_time": "0:42:36", "throughput": "5592.69", "total_tokens": 3870112} +{"current_steps": 216, "total_steps": 1000, "loss": 1.5375, "learning_rate": 9.595676696276172e-05, "epoch": 2.687402799377916, "percentage": 21.6, "cur_time": "2024-08-20 08:47:37", "elapsed_time": "0:11:42", "remaining_time": "0:42:30", "throughput": "5586.63", "total_tokens": 3926352} +{"current_steps": 219, "total_steps": 1000, "loss": 1.4106, "learning_rate": 9.574798339249125e-05, "epoch": 2.724727838258165, "percentage": 21.9, "cur_time": "2024-08-20 08:47:47", "elapsed_time": "0:11:52", "remaining_time": "0:42:20", "throughput": "5587.85", "total_tokens": 3980768} +{"current_steps": 222, "total_steps": 1000, "loss": 1.4377, "learning_rate": 9.553418304030886e-05, "epoch": 2.7620528771384136, "percentage": 22.2, "cur_time": "2024-08-20 08:47:55", "elapsed_time": "0:12:00", "remaining_time": "0:42:06", "throughput": "5589.39", "total_tokens": 4029792} +{"current_steps": 225, "total_steps": 1000, "loss": 1.4638, "learning_rate": 9.53153893518325e-05, "epoch": 2.7993779160186625, "percentage": 22.5, "cur_time": "2024-08-20 08:48:05", "elapsed_time": "0:12:10", "remaining_time": "0:41:57", "throughput": "5592.13", "total_tokens": 4087248} +{"current_steps": 228, "total_steps": 1000, "loss": 1.469, "learning_rate": 9.50916263202557e-05, "epoch": 2.8367029548989113, "percentage": 22.8, "cur_time": "2024-08-20 08:48:14", "elapsed_time": "0:12:20", "remaining_time": "0:41:46", "throughput": "5597.20", "total_tokens": 4142912} +{"current_steps": 231, "total_steps": 1000, "loss": 1.4436, "learning_rate": 9.486291848371643e-05, "epoch": 2.87402799377916, "percentage": 23.1, "cur_time": "2024-08-20 08:48:24", "elapsed_time": "0:12:29", "remaining_time": "0:41:35", "throughput": "5601.04", "total_tokens": 4198272} +{"current_steps": 234, "total_steps": 1000, "loss": 1.4491, "learning_rate": 9.462929092260628e-05, "epoch": 2.911353032659409, "percentage": 23.4, "cur_time": "2024-08-20 08:48:33", "elapsed_time": "0:12:39", "remaining_time": "0:41:25", "throughput": "5601.17", "total_tokens": 4252528} +{"current_steps": 237, "total_steps": 1000, "loss": 1.4245, "learning_rate": 9.439076925682006e-05, "epoch": 2.948678071539658, "percentage": 23.7, "cur_time": "2024-08-20 08:48:43", "elapsed_time": "0:12:48", "remaining_time": "0:41:15", "throughput": "5599.01", "total_tokens": 4305472} +{"current_steps": 240, "total_steps": 1000, "loss": 1.5656, "learning_rate": 9.414737964294636e-05, "epoch": 2.9860031104199067, "percentage": 24.0, "cur_time": "2024-08-20 08:48:53", "elapsed_time": "0:12:58", "remaining_time": "0:41:05", "throughput": "5600.59", "total_tokens": 4360608} +{"current_steps": 243, "total_steps": 1000, "loss": 1.362, "learning_rate": 9.389914877139903e-05, "epoch": 3.0233281493001556, "percentage": 24.3, "cur_time": "2024-08-20 08:49:04", "elapsed_time": "0:13:09", "remaining_time": "0:40:59", "throughput": "5596.50", "total_tokens": 4418496} +{"current_steps": 246, "total_steps": 1000, "loss": 1.4078, "learning_rate": 9.364610386349049e-05, "epoch": 3.0606531881804044, "percentage": 24.6, "cur_time": "2024-08-20 08:49:12", "elapsed_time": "0:13:17", "remaining_time": "0:40:45", "throughput": "5603.43", "total_tokens": 4471104} +{"current_steps": 249, "total_steps": 1000, "loss": 1.3074, "learning_rate": 9.338827266844644e-05, "epoch": 3.0979782270606533, "percentage": 24.9, "cur_time": "2024-08-20 08:49:21", "elapsed_time": "0:13:27", "remaining_time": "0:40:34", "throughput": "5602.91", "total_tokens": 4522432} +{"current_steps": 252, "total_steps": 1000, "loss": 1.379, "learning_rate": 9.312568346036288e-05, "epoch": 3.135303265940902, "percentage": 25.2, "cur_time": "2024-08-20 08:49:30", "elapsed_time": "0:13:36", "remaining_time": "0:40:23", "throughput": "5606.86", "total_tokens": 4577216} +{"current_steps": 255, "total_steps": 1000, "loss": 1.3311, "learning_rate": 9.285836503510562e-05, "epoch": 3.172628304821151, "percentage": 25.5, "cur_time": "2024-08-20 08:49:40", "elapsed_time": "0:13:45", "remaining_time": "0:40:11", "throughput": "5607.04", "total_tokens": 4628832} +{"current_steps": 258, "total_steps": 1000, "loss": 1.3993, "learning_rate": 9.258634670715238e-05, "epoch": 3.2099533437014, "percentage": 25.8, "cur_time": "2024-08-20 08:49:49", "elapsed_time": "0:13:55", "remaining_time": "0:40:01", "throughput": "5608.64", "total_tokens": 4683312} +{"current_steps": 261, "total_steps": 1000, "loss": 1.3511, "learning_rate": 9.230965830637821e-05, "epoch": 3.2472783825816487, "percentage": 26.1, "cur_time": "2024-08-20 08:50:00", "elapsed_time": "0:14:06", "remaining_time": "0:39:55", "throughput": "5601.29", "total_tokens": 4739792} +{"current_steps": 264, "total_steps": 1000, "loss": 1.3009, "learning_rate": 9.202833017478422e-05, "epoch": 3.2846034214618975, "percentage": 26.4, "cur_time": "2024-08-20 08:50:10", "elapsed_time": "0:14:15", "remaining_time": "0:39:44", "throughput": "5601.46", "total_tokens": 4791312} +{"current_steps": 267, "total_steps": 1000, "loss": 1.3149, "learning_rate": 9.174239316317033e-05, "epoch": 3.3219284603421464, "percentage": 26.7, "cur_time": "2024-08-20 08:50:19", "elapsed_time": "0:14:24", "remaining_time": "0:39:34", "throughput": "5603.20", "total_tokens": 4845744} +{"current_steps": 270, "total_steps": 1000, "loss": 1.3126, "learning_rate": 9.145187862775209e-05, "epoch": 3.359253499222395, "percentage": 27.0, "cur_time": "2024-08-20 08:50:28", "elapsed_time": "0:14:33", "remaining_time": "0:39:22", "throughput": "5605.25", "total_tokens": 4897472} +{"current_steps": 273, "total_steps": 1000, "loss": 1.341, "learning_rate": 9.11568184267221e-05, "epoch": 3.396578538102644, "percentage": 27.3, "cur_time": "2024-08-20 08:50:38", "elapsed_time": "0:14:43", "remaining_time": "0:39:12", "throughput": "5608.11", "total_tokens": 4954272} +{"current_steps": 276, "total_steps": 1000, "loss": 1.2816, "learning_rate": 9.085724491675642e-05, "epoch": 3.4339035769828925, "percentage": 27.6, "cur_time": "2024-08-20 08:50:48", "elapsed_time": "0:14:53", "remaining_time": "0:39:04", "throughput": "5606.66", "total_tokens": 5010272} +{"current_steps": 279, "total_steps": 1000, "loss": 1.3563, "learning_rate": 9.055319094946633e-05, "epoch": 3.4712286158631414, "percentage": 27.9, "cur_time": "2024-08-20 08:50:58", "elapsed_time": "0:15:04", "remaining_time": "0:38:56", "throughput": "5603.97", "total_tokens": 5066032} +{"current_steps": 282, "total_steps": 1000, "loss": 1.3326, "learning_rate": 9.02446898677957e-05, "epoch": 3.50855365474339, "percentage": 28.2, "cur_time": "2024-08-20 08:51:07", "elapsed_time": "0:15:12", "remaining_time": "0:38:42", "throughput": "5606.82", "total_tokens": 5115488} +{"current_steps": 285, "total_steps": 1000, "loss": 1.4707, "learning_rate": 8.993177550236464e-05, "epoch": 3.545878693623639, "percentage": 28.5, "cur_time": "2024-08-20 08:51:18", "elapsed_time": "0:15:23", "remaining_time": "0:38:37", "throughput": "5602.16", "total_tokens": 5174736} +{"current_steps": 288, "total_steps": 1000, "loss": 1.2941, "learning_rate": 8.961448216775954e-05, "epoch": 3.583203732503888, "percentage": 28.8, "cur_time": "2024-08-20 08:51:28", "elapsed_time": "0:15:33", "remaining_time": "0:38:27", "throughput": "5602.41", "total_tokens": 5230016} +{"current_steps": 291, "total_steps": 1000, "loss": 1.3435, "learning_rate": 8.92928446587701e-05, "epoch": 3.6205287713841368, "percentage": 29.1, "cur_time": "2024-08-20 08:51:38", "elapsed_time": "0:15:43", "remaining_time": "0:38:19", "throughput": "5604.35", "total_tokens": 5289728} +{"current_steps": 294, "total_steps": 1000, "loss": 1.3954, "learning_rate": 8.896689824657372e-05, "epoch": 3.6578538102643856, "percentage": 29.4, "cur_time": "2024-08-20 08:51:47", "elapsed_time": "0:15:52", "remaining_time": "0:38:08", "throughput": "5608.39", "total_tokens": 5344176} +{"current_steps": 297, "total_steps": 1000, "loss": 1.2821, "learning_rate": 8.863667867486756e-05, "epoch": 3.6951788491446345, "percentage": 29.7, "cur_time": "2024-08-20 08:51:56", "elapsed_time": "0:16:02", "remaining_time": "0:37:57", "throughput": "5607.93", "total_tokens": 5396144} +{"current_steps": 300, "total_steps": 1000, "loss": 1.3893, "learning_rate": 8.83022221559489e-05, "epoch": 3.7325038880248833, "percentage": 30.0, "cur_time": "2024-08-20 08:52:06", "elapsed_time": "0:16:11", "remaining_time": "0:37:46", "throughput": "5608.93", "total_tokens": 5448672} +{"current_steps": 303, "total_steps": 1000, "loss": 1.3796, "learning_rate": 8.796356536674403e-05, "epoch": 3.769828926905132, "percentage": 30.3, "cur_time": "2024-08-20 08:52:15", "elapsed_time": "0:16:21", "remaining_time": "0:37:37", "throughput": "5609.14", "total_tokens": 5503696} +{"current_steps": 306, "total_steps": 1000, "loss": 1.4454, "learning_rate": 8.762074544478623e-05, "epoch": 3.807153965785381, "percentage": 30.6, "cur_time": "2024-08-20 08:52:25", "elapsed_time": "0:16:31", "remaining_time": "0:37:27", "throughput": "5607.90", "total_tokens": 5557888} +{"current_steps": 309, "total_steps": 1000, "loss": 1.3422, "learning_rate": 8.727379998414311e-05, "epoch": 3.84447900466563, "percentage": 30.9, "cur_time": "2024-08-20 08:52:34", "elapsed_time": "0:16:39", "remaining_time": "0:37:15", "throughput": "5610.13", "total_tokens": 5609232} +{"current_steps": 312, "total_steps": 1000, "loss": 1.3338, "learning_rate": 8.692276703129421e-05, "epoch": 3.8818040435458787, "percentage": 31.2, "cur_time": "2024-08-20 08:52:44", "elapsed_time": "0:16:49", "remaining_time": "0:37:06", "throughput": "5611.81", "total_tokens": 5667072} +{"current_steps": 315, "total_steps": 1000, "loss": 1.3997, "learning_rate": 8.656768508095853e-05, "epoch": 3.9191290824261276, "percentage": 31.5, "cur_time": "2024-08-20 08:52:54", "elapsed_time": "0:17:00", "remaining_time": "0:36:58", "throughput": "5609.19", "total_tokens": 5722672} +{"current_steps": 318, "total_steps": 1000, "loss": 1.434, "learning_rate": 8.620859307187339e-05, "epoch": 3.9564541213063764, "percentage": 31.8, "cur_time": "2024-08-20 08:53:04", "elapsed_time": "0:17:09", "remaining_time": "0:36:48", "throughput": "5612.36", "total_tokens": 5780096} +{"current_steps": 321, "total_steps": 1000, "loss": 1.3018, "learning_rate": 8.584553038252414e-05, "epoch": 3.9937791601866253, "percentage": 32.1, "cur_time": "2024-08-20 08:53:15", "elapsed_time": "0:17:20", "remaining_time": "0:36:40", "throughput": "5608.36", "total_tokens": 5835632} +{"current_steps": 324, "total_steps": 1000, "loss": 1.2729, "learning_rate": 8.547853682682604e-05, "epoch": 4.031104199066874, "percentage": 32.4, "cur_time": "2024-08-20 08:53:25", "elapsed_time": "0:17:30", "remaining_time": "0:36:31", "throughput": "5608.93", "total_tokens": 5891616} +{"current_steps": 327, "total_steps": 1000, "loss": 1.1823, "learning_rate": 8.510765264975813e-05, "epoch": 4.0684292379471225, "percentage": 32.7, "cur_time": "2024-08-20 08:53:34", "elapsed_time": "0:17:39", "remaining_time": "0:36:21", "throughput": "5606.11", "total_tokens": 5942400} +{"current_steps": 330, "total_steps": 1000, "loss": 1.4483, "learning_rate": 8.473291852294987e-05, "epoch": 4.105754276827372, "percentage": 33.0, "cur_time": "2024-08-20 08:53:43", "elapsed_time": "0:17:49", "remaining_time": "0:36:11", "throughput": "5608.80", "total_tokens": 5997728} +{"current_steps": 333, "total_steps": 1000, "loss": 1.1807, "learning_rate": 8.435437554022115e-05, "epoch": 4.14307931570762, "percentage": 33.3, "cur_time": "2024-08-20 08:53:52", "elapsed_time": "0:17:58", "remaining_time": "0:35:59", "throughput": "5610.05", "total_tokens": 6049312} +{"current_steps": 336, "total_steps": 1000, "loss": 1.2298, "learning_rate": 8.397206521307584e-05, "epoch": 4.1804043545878695, "percentage": 33.6, "cur_time": "2024-08-20 08:54:03", "elapsed_time": "0:18:08", "remaining_time": "0:35:50", "throughput": "5612.01", "total_tokens": 6108288} +{"current_steps": 339, "total_steps": 1000, "loss": 1.1629, "learning_rate": 8.358602946614951e-05, "epoch": 4.217729393468118, "percentage": 33.9, "cur_time": "2024-08-20 08:54:11", "elapsed_time": "0:18:17", "remaining_time": "0:35:39", "throughput": "5613.60", "total_tokens": 6160144} +{"current_steps": 342, "total_steps": 1000, "loss": 1.2641, "learning_rate": 8.319631063261209e-05, "epoch": 4.255054432348367, "percentage": 34.2, "cur_time": "2024-08-20 08:54:21", "elapsed_time": "0:18:27", "remaining_time": "0:35:30", "throughput": "5614.25", "total_tokens": 6216672} +{"current_steps": 345, "total_steps": 1000, "loss": 1.2478, "learning_rate": 8.280295144952536e-05, "epoch": 4.292379471228616, "percentage": 34.5, "cur_time": "2024-08-20 08:54:31", "elapsed_time": "0:18:36", "remaining_time": "0:35:20", "throughput": "5614.50", "total_tokens": 6271168} +{"current_steps": 348, "total_steps": 1000, "loss": 1.2675, "learning_rate": 8.240599505315655e-05, "epoch": 4.329704510108865, "percentage": 34.8, "cur_time": "2024-08-20 08:54:41", "elapsed_time": "0:18:46", "remaining_time": "0:35:10", "throughput": "5616.26", "total_tokens": 6326992} +{"current_steps": 351, "total_steps": 1000, "loss": 1.2551, "learning_rate": 8.200548497424778e-05, "epoch": 4.367029548989113, "percentage": 35.1, "cur_time": "2024-08-20 08:54:50", "elapsed_time": "0:18:55", "remaining_time": "0:34:59", "throughput": "5620.06", "total_tokens": 6380944} +{"current_steps": 354, "total_steps": 1000, "loss": 1.2011, "learning_rate": 8.160146513324254e-05, "epoch": 4.404354587869363, "percentage": 35.4, "cur_time": "2024-08-20 08:54:59", "elapsed_time": "0:19:05", "remaining_time": "0:34:49", "throughput": "5620.62", "total_tokens": 6436144} +{"current_steps": 357, "total_steps": 1000, "loss": 1.1243, "learning_rate": 8.119397983546932e-05, "epoch": 4.441679626749611, "percentage": 35.7, "cur_time": "2024-08-20 08:55:08", "elapsed_time": "0:19:13", "remaining_time": "0:34:38", "throughput": "5622.04", "total_tokens": 6487824} +{"current_steps": 360, "total_steps": 1000, "loss": 1.2775, "learning_rate": 8.07830737662829e-05, "epoch": 4.47900466562986, "percentage": 36.0, "cur_time": "2024-08-20 08:55:18", "elapsed_time": "0:19:23", "remaining_time": "0:34:28", "throughput": "5620.80", "total_tokens": 6540448} +{"current_steps": 363, "total_steps": 1000, "loss": 1.208, "learning_rate": 8.036879198616434e-05, "epoch": 4.516329704510109, "percentage": 36.3, "cur_time": "2024-08-20 08:55:27", "elapsed_time": "0:19:33", "remaining_time": "0:34:18", "throughput": "5620.04", "total_tokens": 6593248} +{"current_steps": 366, "total_steps": 1000, "loss": 1.1828, "learning_rate": 7.99511799257793e-05, "epoch": 4.553654743390358, "percentage": 36.6, "cur_time": "2024-08-20 08:55:38", "elapsed_time": "0:19:43", "remaining_time": "0:34:10", "throughput": "5616.65", "total_tokens": 6648976} +{"current_steps": 369, "total_steps": 1000, "loss": 1.2522, "learning_rate": 7.953028338099627e-05, "epoch": 4.590979782270606, "percentage": 36.9, "cur_time": "2024-08-20 08:55:47", "elapsed_time": "0:19:53", "remaining_time": "0:34:00", "throughput": "5618.55", "total_tokens": 6703440} +{"current_steps": 372, "total_steps": 1000, "loss": 1.2285, "learning_rate": 7.910614850786448e-05, "epoch": 4.628304821150856, "percentage": 37.2, "cur_time": "2024-08-20 08:55:56", "elapsed_time": "0:20:02", "remaining_time": "0:33:49", "throughput": "5619.62", "total_tokens": 6755360} +{"current_steps": 375, "total_steps": 1000, "loss": 1.2556, "learning_rate": 7.86788218175523e-05, "epoch": 4.665629860031104, "percentage": 37.5, "cur_time": "2024-08-20 08:56:06", "elapsed_time": "0:20:12", "remaining_time": "0:33:40", "throughput": "5619.45", "total_tokens": 6811440} +{"current_steps": 378, "total_steps": 1000, "loss": 1.3034, "learning_rate": 7.82483501712469e-05, "epoch": 4.7029548989113525, "percentage": 37.8, "cur_time": "2024-08-20 08:56:17", "elapsed_time": "0:20:23", "remaining_time": "0:33:32", "throughput": "5617.63", "total_tokens": 6870992} +{"current_steps": 381, "total_steps": 1000, "loss": 1.2352, "learning_rate": 7.781478077501525e-05, "epoch": 4.740279937791602, "percentage": 38.1, "cur_time": "2024-08-20 08:56:27", "elapsed_time": "0:20:33", "remaining_time": "0:33:23", "throughput": "5617.18", "total_tokens": 6927120} +{"current_steps": 384, "total_steps": 1000, "loss": 1.2768, "learning_rate": 7.737816117462752e-05, "epoch": 4.777604976671851, "percentage": 38.4, "cur_time": "2024-08-20 08:56:37", "elapsed_time": "0:20:42", "remaining_time": "0:33:13", "throughput": "5619.42", "total_tokens": 6983216} +{"current_steps": 387, "total_steps": 1000, "loss": 1.3318, "learning_rate": 7.693853925034315e-05, "epoch": 4.8149300155520995, "percentage": 38.7, "cur_time": "2024-08-20 08:56:46", "elapsed_time": "0:20:51", "remaining_time": "0:33:03", "throughput": "5624.43", "total_tokens": 7041648} +{"current_steps": 390, "total_steps": 1000, "loss": 1.2715, "learning_rate": 7.649596321166024e-05, "epoch": 4.852255054432348, "percentage": 39.0, "cur_time": "2024-08-20 08:56:55", "elapsed_time": "0:21:01", "remaining_time": "0:32:52", "throughput": "5625.04", "total_tokens": 7095104} +{"current_steps": 393, "total_steps": 1000, "loss": 1.1691, "learning_rate": 7.605048159202883e-05, "epoch": 4.889580093312597, "percentage": 39.3, "cur_time": "2024-08-20 08:57:04", "elapsed_time": "0:21:10", "remaining_time": "0:32:41", "throughput": "5626.04", "total_tokens": 7146512} +{"current_steps": 396, "total_steps": 1000, "loss": 1.2651, "learning_rate": 7.560214324352858e-05, "epoch": 4.926905132192846, "percentage": 39.6, "cur_time": "2024-08-20 08:57:14", "elapsed_time": "0:21:19", "remaining_time": "0:32:31", "throughput": "5626.16", "total_tokens": 7198704} +{"current_steps": 399, "total_steps": 1000, "loss": 1.3019, "learning_rate": 7.515099733151177e-05, "epoch": 4.964230171073095, "percentage": 39.9, "cur_time": "2024-08-20 08:57:24", "elapsed_time": "0:21:29", "remaining_time": "0:32:22", "throughput": "5626.49", "total_tokens": 7254608} +{"current_steps": 402, "total_steps": 1000, "loss": 1.2474, "learning_rate": 7.469709332921155e-05, "epoch": 5.001555209953343, "percentage": 40.2, "cur_time": "2024-08-20 08:57:34", "elapsed_time": "0:21:39", "remaining_time": "0:32:13", "throughput": "5625.70", "total_tokens": 7312224} +{"current_steps": 405, "total_steps": 1000, "loss": 1.1313, "learning_rate": 7.424048101231686e-05, "epoch": 5.038880248833593, "percentage": 40.5, "cur_time": "2024-08-20 08:57:44", "elapsed_time": "0:21:50", "remaining_time": "0:32:04", "throughput": "5621.80", "total_tokens": 7365456} +{"current_steps": 408, "total_steps": 1000, "loss": 1.265, "learning_rate": 7.378121045351378e-05, "epoch": 5.076205287713841, "percentage": 40.8, "cur_time": "2024-08-20 08:57:54", "elapsed_time": "0:21:59", "remaining_time": "0:31:55", "throughput": "5622.94", "total_tokens": 7421200} +{"current_steps": 411, "total_steps": 1000, "loss": 1.1876, "learning_rate": 7.331933201699457e-05, "epoch": 5.11353032659409, "percentage": 41.1, "cur_time": "2024-08-20 08:58:03", "elapsed_time": "0:22:09", "remaining_time": "0:31:44", "throughput": "5624.30", "total_tokens": 7474704} +{"current_steps": 414, "total_steps": 1000, "loss": 1.063, "learning_rate": 7.285489635293472e-05, "epoch": 5.150855365474339, "percentage": 41.4, "cur_time": "2024-08-20 08:58:13", "elapsed_time": "0:22:18", "remaining_time": "0:31:34", "throughput": "5623.42", "total_tokens": 7527856} +{"current_steps": 417, "total_steps": 1000, "loss": 1.1132, "learning_rate": 7.238795439193848e-05, "epoch": 5.188180404354588, "percentage": 41.7, "cur_time": "2024-08-20 08:58:23", "elapsed_time": "0:22:29", "remaining_time": "0:31:26", "throughput": "5622.53", "total_tokens": 7585040} +{"current_steps": 420, "total_steps": 1000, "loss": 1.1264, "learning_rate": 7.191855733945387e-05, "epoch": 5.2255054432348365, "percentage": 42.0, "cur_time": "2024-08-20 08:58:34", "elapsed_time": "0:22:39", "remaining_time": "0:31:17", "throughput": "5623.08", "total_tokens": 7643952} +{"current_steps": 423, "total_steps": 1000, "loss": 1.1316, "learning_rate": 7.14467566701573e-05, "epoch": 5.262830482115086, "percentage": 42.3, "cur_time": "2024-08-20 08:58:44", "elapsed_time": "0:22:49", "remaining_time": "0:31:08", "throughput": "5622.17", "total_tokens": 7699952} +{"current_steps": 426, "total_steps": 1000, "loss": 1.1275, "learning_rate": 7.097260412230886e-05, "epoch": 5.300155520995334, "percentage": 42.6, "cur_time": "2024-08-20 08:58:53", "elapsed_time": "0:22:58", "remaining_time": "0:30:58", "throughput": "5623.43", "total_tokens": 7754384} +{"current_steps": 429, "total_steps": 1000, "loss": 1.1736, "learning_rate": 7.049615169207864e-05, "epoch": 5.3374805598755835, "percentage": 42.9, "cur_time": "2024-08-20 08:59:03", "elapsed_time": "0:23:08", "remaining_time": "0:30:48", "throughput": "5625.36", "total_tokens": 7811760} +{"current_steps": 432, "total_steps": 1000, "loss": 1.1063, "learning_rate": 7.001745162784477e-05, "epoch": 5.374805598755832, "percentage": 43.2, "cur_time": "2024-08-20 08:59:12", "elapsed_time": "0:23:17", "remaining_time": "0:30:37", "throughput": "5628.21", "total_tokens": 7867376} +{"current_steps": 435, "total_steps": 1000, "loss": 1.0924, "learning_rate": 6.953655642446368e-05, "epoch": 5.412130637636081, "percentage": 43.5, "cur_time": "2024-08-20 08:59:22", "elapsed_time": "0:23:27", "remaining_time": "0:30:28", "throughput": "5627.34", "total_tokens": 7920736} +{"current_steps": 438, "total_steps": 1000, "loss": 1.1583, "learning_rate": 6.905351881751372e-05, "epoch": 5.44945567651633, "percentage": 43.8, "cur_time": "2024-08-20 08:59:31", "elapsed_time": "0:23:37", "remaining_time": "0:30:18", "throughput": "5627.48", "total_tokens": 7975088} +{"current_steps": 441, "total_steps": 1000, "loss": 1.0812, "learning_rate": 6.856839177751176e-05, "epoch": 5.486780715396579, "percentage": 44.1, "cur_time": "2024-08-20 08:59:41", "elapsed_time": "0:23:46", "remaining_time": "0:30:08", "throughput": "5627.69", "total_tokens": 8027328} +{"current_steps": 444, "total_steps": 1000, "loss": 1.0749, "learning_rate": 6.808122850410461e-05, "epoch": 5.524105754276827, "percentage": 44.4, "cur_time": "2024-08-20 08:59:51", "elapsed_time": "0:23:56", "remaining_time": "0:29:58", "throughput": "5626.94", "total_tokens": 8083392} +{"current_steps": 447, "total_steps": 1000, "loss": 1.1026, "learning_rate": 6.759208242023509e-05, "epoch": 5.561430793157077, "percentage": 44.7, "cur_time": "2024-08-20 08:59:59", "elapsed_time": "0:24:05", "remaining_time": "0:29:48", "throughput": "5628.35", "total_tokens": 8134800} +{"current_steps": 450, "total_steps": 1000, "loss": 1.092, "learning_rate": 6.710100716628344e-05, "epoch": 5.598755832037325, "percentage": 45.0, "cur_time": "2024-08-20 09:00:09", "elapsed_time": "0:24:15", "remaining_time": "0:29:38", "throughput": "5627.93", "total_tokens": 8188944} +{"current_steps": 453, "total_steps": 1000, "loss": 1.2196, "learning_rate": 6.660805659418516e-05, "epoch": 5.636080870917574, "percentage": 45.3, "cur_time": "2024-08-20 09:00:19", "elapsed_time": "0:24:25", "remaining_time": "0:29:29", "throughput": "5628.98", "total_tokens": 8248096} +{"current_steps": 456, "total_steps": 1000, "loss": 1.1435, "learning_rate": 6.611328476152557e-05, "epoch": 5.673405909797823, "percentage": 45.6, "cur_time": "2024-08-20 09:00:29", "elapsed_time": "0:24:35", "remaining_time": "0:29:19", "throughput": "5629.04", "total_tokens": 8302864} +{"current_steps": 459, "total_steps": 1000, "loss": 1.0955, "learning_rate": 6.561674592561163e-05, "epoch": 5.710730948678071, "percentage": 45.9, "cur_time": "2024-08-20 09:00:40", "elapsed_time": "0:24:46", "remaining_time": "0:29:11", "throughput": "5623.07", "total_tokens": 8356800} +{"current_steps": 462, "total_steps": 1000, "loss": 1.2012, "learning_rate": 6.511849453752223e-05, "epoch": 5.74805598755832, "percentage": 46.2, "cur_time": "2024-08-20 09:00:50", "elapsed_time": "0:24:55", "remaining_time": "0:29:01", "throughput": "5623.50", "total_tokens": 8410576} +{"current_steps": 465, "total_steps": 1000, "loss": 1.1735, "learning_rate": 6.461858523613684e-05, "epoch": 5.78538102643857, "percentage": 46.5, "cur_time": "2024-08-20 09:00:59", "elapsed_time": "0:25:05", "remaining_time": "0:28:51", "throughput": "5626.01", "total_tokens": 8468016} +{"current_steps": 468, "total_steps": 1000, "loss": 1.1583, "learning_rate": 6.411707284214384e-05, "epoch": 5.822706065318818, "percentage": 46.8, "cur_time": "2024-08-20 09:01:08", "elapsed_time": "0:25:13", "remaining_time": "0:28:40", "throughput": "5627.10", "total_tokens": 8518768} +{"current_steps": 471, "total_steps": 1000, "loss": 1.1513, "learning_rate": 6.361401235202872e-05, "epoch": 5.8600311041990665, "percentage": 47.1, "cur_time": "2024-08-20 09:01:17", "elapsed_time": "0:25:23", "remaining_time": "0:28:30", "throughput": "5628.53", "total_tokens": 8573872} +{"current_steps": 474, "total_steps": 1000, "loss": 1.06, "learning_rate": 6.310945893204324e-05, "epoch": 5.897356143079316, "percentage": 47.4, "cur_time": "2024-08-20 09:01:27", "elapsed_time": "0:25:32", "remaining_time": "0:28:21", "throughput": "5629.10", "total_tokens": 8628928} +{"current_steps": 477, "total_steps": 1000, "loss": 1.1553, "learning_rate": 6.26034679121557e-05, "epoch": 5.934681181959564, "percentage": 47.7, "cur_time": "2024-08-20 09:01:36", "elapsed_time": "0:25:42", "remaining_time": "0:28:10", "throughput": "5629.51", "total_tokens": 8681104} +{"current_steps": 480, "total_steps": 1000, "loss": 1.1209, "learning_rate": 6.209609477998338e-05, "epoch": 5.9720062208398135, "percentage": 48.0, "cur_time": "2024-08-20 09:01:46", "elapsed_time": "0:25:51", "remaining_time": "0:28:01", "throughput": "5629.32", "total_tokens": 8735984} +{"current_steps": 483, "total_steps": 1000, "loss": 1.1008, "learning_rate": 6.158739517470786e-05, "epoch": 6.009331259720062, "percentage": 48.3, "cur_time": "2024-08-20 09:01:55", "elapsed_time": "0:26:01", "remaining_time": "0:27:51", "throughput": "5630.49", "total_tokens": 8790416} +{"current_steps": 486, "total_steps": 1000, "loss": 1.0679, "learning_rate": 6.107742488097338e-05, "epoch": 6.046656298600311, "percentage": 48.6, "cur_time": "2024-08-20 09:02:06", "elapsed_time": "0:26:11", "remaining_time": "0:27:41", "throughput": "5631.02", "total_tokens": 8848464} +{"current_steps": 489, "total_steps": 1000, "loss": 1.0223, "learning_rate": 6.056623982276944e-05, "epoch": 6.08398133748056, "percentage": 48.9, "cur_time": "2024-08-20 09:02:14", "elapsed_time": "0:26:20", "remaining_time": "0:27:31", "throughput": "5632.16", "total_tokens": 8900736} +{"current_steps": 492, "total_steps": 1000, "loss": 1.0493, "learning_rate": 6.005389605729824e-05, "epoch": 6.121306376360809, "percentage": 49.2, "cur_time": "2024-08-20 09:02:24", "elapsed_time": "0:26:30", "remaining_time": "0:27:21", "throughput": "5632.82", "total_tokens": 8957616} +{"current_steps": 495, "total_steps": 1000, "loss": 1.0017, "learning_rate": 5.9540449768827246e-05, "epoch": 6.158631415241057, "percentage": 49.5, "cur_time": "2024-08-20 09:02:34", "elapsed_time": "0:26:39", "remaining_time": "0:27:12", "throughput": "5634.26", "total_tokens": 9013680} +{"current_steps": 498, "total_steps": 1000, "loss": 1.0215, "learning_rate": 5.902595726252801e-05, "epoch": 6.195956454121307, "percentage": 49.8, "cur_time": "2024-08-20 09:02:44", "elapsed_time": "0:26:50", "remaining_time": "0:27:02", "throughput": "5633.06", "total_tokens": 9069536} +{"current_steps": 500, "total_steps": 1000, "eval_loss": 1.6311813592910767, "epoch": 6.2208398133748055, "percentage": 50.0, "cur_time": "2024-08-20 09:03:01", "elapsed_time": "0:27:07", "remaining_time": "0:27:07", "throughput": "5598.75", "total_tokens": 9110112} +{"current_steps": 501, "total_steps": 1000, "loss": 1.0748, "learning_rate": 5.851047495830163e-05, "epoch": 6.233281493001555, "percentage": 50.1, "cur_time": "2024-08-20 09:03:04", "elapsed_time": "0:27:10", "remaining_time": "0:27:03", "throughput": "5598.77", "total_tokens": 9127472} +{"current_steps": 504, "total_steps": 1000, "loss": 0.9791, "learning_rate": 5.799405938459175e-05, "epoch": 6.270606531881804, "percentage": 50.4, "cur_time": "2024-08-20 09:03:13", "elapsed_time": "0:27:19", "remaining_time": "0:26:53", "throughput": "5598.44", "total_tokens": 9176608} +{"current_steps": 507, "total_steps": 1000, "loss": 1.0378, "learning_rate": 5.747676717218549e-05, "epoch": 6.307931570762053, "percentage": 50.7, "cur_time": "2024-08-20 09:03:22", "elapsed_time": "0:27:28", "remaining_time": "0:26:42", "throughput": "5598.76", "total_tokens": 9228576} +{"current_steps": 510, "total_steps": 1000, "loss": 1.0241, "learning_rate": 5.695865504800327e-05, "epoch": 6.345256609642302, "percentage": 51.0, "cur_time": "2024-08-20 09:03:33", "elapsed_time": "0:27:38", "remaining_time": "0:26:33", "throughput": "5599.28", "total_tokens": 9286272} +{"current_steps": 513, "total_steps": 1000, "loss": 1.0465, "learning_rate": 5.643977982887815e-05, "epoch": 6.38258164852255, "percentage": 51.3, "cur_time": "2024-08-20 09:03:43", "elapsed_time": "0:27:48", "remaining_time": "0:26:23", "throughput": "5599.53", "total_tokens": 9342096} +{"current_steps": 516, "total_steps": 1000, "loss": 1.0456, "learning_rate": 5.5920198415325064e-05, "epoch": 6.4199066874028, "percentage": 51.6, "cur_time": "2024-08-20 09:03:52", "elapsed_time": "0:27:58", "remaining_time": "0:26:14", "throughput": "5599.20", "total_tokens": 9396240} +{"current_steps": 519, "total_steps": 1000, "loss": 1.0381, "learning_rate": 5.539996778530115e-05, "epoch": 6.457231726283048, "percentage": 51.9, "cur_time": "2024-08-20 09:04:02", "elapsed_time": "0:28:08", "remaining_time": "0:26:04", "throughput": "5598.87", "total_tokens": 9451488} +{"current_steps": 522, "total_steps": 1000, "loss": 1.1424, "learning_rate": 5.487914498795747e-05, "epoch": 6.494556765163297, "percentage": 52.2, "cur_time": "2024-08-20 09:04:12", "elapsed_time": "0:28:18", "remaining_time": "0:25:54", "throughput": "5598.49", "total_tokens": 9506784} +{"current_steps": 525, "total_steps": 1000, "loss": 1.0076, "learning_rate": 5.435778713738292e-05, "epoch": 6.531881804043546, "percentage": 52.5, "cur_time": "2024-08-20 09:04:23", "elapsed_time": "0:28:28", "remaining_time": "0:25:45", "throughput": "5596.36", "total_tokens": 9562448} +{"current_steps": 528, "total_steps": 1000, "loss": 1.0462, "learning_rate": 5.383595140634093e-05, "epoch": 6.569206842923795, "percentage": 52.8, "cur_time": "2024-08-20 09:04:32", "elapsed_time": "0:28:37", "remaining_time": "0:25:35", "throughput": "5597.36", "total_tokens": 9615664} +{"current_steps": 531, "total_steps": 1000, "loss": 1.0512, "learning_rate": 5.3313695020000024e-05, "epoch": 6.6065318818040435, "percentage": 53.1, "cur_time": "2024-08-20 09:04:42", "elapsed_time": "0:28:48", "remaining_time": "0:25:26", "throughput": "5596.84", "total_tokens": 9671744} +{"current_steps": 534, "total_steps": 1000, "loss": 0.9775, "learning_rate": 5.279107524965819e-05, "epoch": 6.643856920684293, "percentage": 53.4, "cur_time": "2024-08-20 09:04:51", "elapsed_time": "0:28:57", "remaining_time": "0:25:16", "throughput": "5597.41", "total_tokens": 9724048} +{"current_steps": 537, "total_steps": 1000, "loss": 1.0258, "learning_rate": 5.226814940646269e-05, "epoch": 6.681181959564541, "percentage": 53.7, "cur_time": "2024-08-20 09:05:01", "elapsed_time": "0:29:06", "remaining_time": "0:25:06", "throughput": "5596.48", "total_tokens": 9776176} +{"current_steps": 540, "total_steps": 1000, "loss": 1.0739, "learning_rate": 5.174497483512506e-05, "epoch": 6.71850699844479, "percentage": 54.0, "cur_time": "2024-08-20 09:05:10", "elapsed_time": "0:29:16", "remaining_time": "0:24:56", "throughput": "5598.18", "total_tokens": 9832192} +{"current_steps": 543, "total_steps": 1000, "loss": 1.0348, "learning_rate": 5.1221608907632665e-05, "epoch": 6.755832037325039, "percentage": 54.3, "cur_time": "2024-08-20 09:05:20", "elapsed_time": "0:29:25", "remaining_time": "0:24:46", "throughput": "5599.06", "total_tokens": 9886048} +{"current_steps": 546, "total_steps": 1000, "loss": 1.0928, "learning_rate": 5.0698109016957274e-05, "epoch": 6.793157076205288, "percentage": 54.6, "cur_time": "2024-08-20 09:05:29", "elapsed_time": "0:29:34", "remaining_time": "0:24:35", "throughput": "5599.98", "total_tokens": 9939184} +{"current_steps": 549, "total_steps": 1000, "loss": 1.004, "learning_rate": 5.017453257076119e-05, "epoch": 6.830482115085537, "percentage": 54.9, "cur_time": "2024-08-20 09:05:39", "elapsed_time": "0:29:44", "remaining_time": "0:24:25", "throughput": "5599.72", "total_tokens": 9992272} +{"current_steps": 552, "total_steps": 1000, "loss": 0.9519, "learning_rate": 4.965093698510193e-05, "epoch": 6.867807153965785, "percentage": 55.2, "cur_time": "2024-08-20 09:05:48", "elapsed_time": "0:29:53", "remaining_time": "0:24:15", "throughput": "5599.81", "total_tokens": 10045168} +{"current_steps": 555, "total_steps": 1000, "loss": 1.1466, "learning_rate": 4.912737967813583e-05, "epoch": 6.905132192846034, "percentage": 55.5, "cur_time": "2024-08-20 09:05:57", "elapsed_time": "0:30:03", "remaining_time": "0:24:05", "throughput": "5602.25", "total_tokens": 10102720} +{"current_steps": 558, "total_steps": 1000, "loss": 1.0908, "learning_rate": 4.860391806382157e-05, "epoch": 6.942457231726283, "percentage": 55.8, "cur_time": "2024-08-20 09:06:07", "elapsed_time": "0:30:12", "remaining_time": "0:23:55", "throughput": "5602.68", "total_tokens": 10156832} +{"current_steps": 561, "total_steps": 1000, "loss": 0.9568, "learning_rate": 4.8080609545624004e-05, "epoch": 6.979782270606532, "percentage": 56.1, "cur_time": "2024-08-20 09:06:17", "elapsed_time": "0:30:22", "remaining_time": "0:23:46", "throughput": "5602.57", "total_tokens": 10210208} +{"current_steps": 564, "total_steps": 1000, "loss": 1.0496, "learning_rate": 4.755751151021934e-05, "epoch": 7.01710730948678, "percentage": 56.4, "cur_time": "2024-08-20 09:06:27", "elapsed_time": "0:30:32", "remaining_time": "0:23:36", "throughput": "5601.87", "total_tokens": 10267456} +{"current_steps": 567, "total_steps": 1000, "loss": 0.8353, "learning_rate": 4.703468132120193e-05, "epoch": 7.05443234836703, "percentage": 56.7, "cur_time": "2024-08-20 09:06:36", "elapsed_time": "0:30:42", "remaining_time": "0:23:26", "throughput": "5600.17", "total_tokens": 10317312} +{"current_steps": 570, "total_steps": 1000, "loss": 0.9929, "learning_rate": 4.6512176312793736e-05, "epoch": 7.091757387247278, "percentage": 57.0, "cur_time": "2024-08-20 09:06:46", "elapsed_time": "0:30:52", "remaining_time": "0:23:17", "throughput": "5600.74", "total_tokens": 10373232} +{"current_steps": 573, "total_steps": 1000, "loss": 1.0196, "learning_rate": 4.599005378355706e-05, "epoch": 7.129082426127527, "percentage": 57.3, "cur_time": "2024-08-20 09:06:56", "elapsed_time": "0:31:01", "remaining_time": "0:23:07", "throughput": "5602.81", "total_tokens": 10429936} +{"current_steps": 576, "total_steps": 1000, "loss": 0.9094, "learning_rate": 4.5468370990111006e-05, "epoch": 7.166407465007776, "percentage": 57.6, "cur_time": "2024-08-20 09:07:06", "elapsed_time": "0:31:11", "remaining_time": "0:22:57", "throughput": "5602.05", "total_tokens": 10485824} +{"current_steps": 579, "total_steps": 1000, "loss": 0.9206, "learning_rate": 4.494718514085268e-05, "epoch": 7.203732503888025, "percentage": 57.9, "cur_time": "2024-08-20 09:07:15", "elapsed_time": "0:31:21", "remaining_time": "0:22:47", "throughput": "5603.31", "total_tokens": 10540736} +{"current_steps": 582, "total_steps": 1000, "loss": 1.0173, "learning_rate": 4.442655338968373e-05, "epoch": 7.2410575427682735, "percentage": 58.2, "cur_time": "2024-08-20 09:07:25", "elapsed_time": "0:31:30", "remaining_time": "0:22:38", "throughput": "5603.60", "total_tokens": 10596192} +{"current_steps": 585, "total_steps": 1000, "loss": 1.0234, "learning_rate": 4.390653282974264e-05, "epoch": 7.278382581648523, "percentage": 58.5, "cur_time": "2024-08-20 09:07:34", "elapsed_time": "0:31:40", "remaining_time": "0:22:28", "throughput": "5604.97", "total_tokens": 10651376} +{"current_steps": 588, "total_steps": 1000, "loss": 0.9771, "learning_rate": 4.3387180487143876e-05, "epoch": 7.315707620528771, "percentage": 58.8, "cur_time": "2024-08-20 09:07:43", "elapsed_time": "0:31:49", "remaining_time": "0:22:17", "throughput": "5606.76", "total_tokens": 10704512} +{"current_steps": 591, "total_steps": 1000, "loss": 0.8928, "learning_rate": 4.2868553314724425e-05, "epoch": 7.3530326594090205, "percentage": 59.1, "cur_time": "2024-08-20 09:07:53", "elapsed_time": "0:31:59", "remaining_time": "0:22:08", "throughput": "5605.79", "total_tokens": 10758656} +{"current_steps": 594, "total_steps": 1000, "loss": 1.0248, "learning_rate": 4.23507081857981e-05, "epoch": 7.390357698289269, "percentage": 59.4, "cur_time": "2024-08-20 09:08:03", "elapsed_time": "0:32:08", "remaining_time": "0:21:58", "throughput": "5606.26", "total_tokens": 10814112} +{"current_steps": 597, "total_steps": 1000, "loss": 0.8644, "learning_rate": 4.1833701887918904e-05, "epoch": 7.427682737169518, "percentage": 59.7, "cur_time": "2024-08-20 09:08:13", "elapsed_time": "0:32:18", "remaining_time": "0:21:48", "throughput": "5605.74", "total_tokens": 10869264} +{"current_steps": 600, "total_steps": 1000, "loss": 0.9704, "learning_rate": 4.131759111665349e-05, "epoch": 7.465007776049767, "percentage": 60.0, "cur_time": "2024-08-20 09:08:23", "elapsed_time": "0:32:28", "remaining_time": "0:21:39", "throughput": "5605.66", "total_tokens": 10922736} +{"current_steps": 603, "total_steps": 1000, "loss": 1.0462, "learning_rate": 4.080243246936399e-05, "epoch": 7.502332814930016, "percentage": 60.3, "cur_time": "2024-08-20 09:08:32", "elapsed_time": "0:32:38", "remaining_time": "0:21:29", "throughput": "5606.95", "total_tokens": 10979712} +{"current_steps": 606, "total_steps": 1000, "loss": 0.9357, "learning_rate": 4.028828243900141e-05, "epoch": 7.539657853810264, "percentage": 60.6, "cur_time": "2024-08-20 09:08:43", "elapsed_time": "0:32:48", "remaining_time": "0:21:20", "throughput": "5605.80", "total_tokens": 11036672} +{"current_steps": 609, "total_steps": 1000, "loss": 0.9906, "learning_rate": 3.9775197407910485e-05, "epoch": 7.576982892690513, "percentage": 60.9, "cur_time": "2024-08-20 09:08:53", "elapsed_time": "0:32:58", "remaining_time": "0:21:10", "throughput": "5606.24", "total_tokens": 11092496} +{"current_steps": 612, "total_steps": 1000, "loss": 0.9157, "learning_rate": 3.926323364164684e-05, "epoch": 7.614307931570762, "percentage": 61.2, "cur_time": "2024-08-20 09:09:01", "elapsed_time": "0:33:07", "remaining_time": "0:20:59", "throughput": "5607.21", "total_tokens": 11143456} +{"current_steps": 615, "total_steps": 1000, "loss": 0.9151, "learning_rate": 3.875244728280676e-05, "epoch": 7.651632970451011, "percentage": 61.5, "cur_time": "2024-08-20 09:09:12", "elapsed_time": "0:33:17", "remaining_time": "0:20:50", "throughput": "5606.25", "total_tokens": 11200064} +{"current_steps": 618, "total_steps": 1000, "loss": 1.0072, "learning_rate": 3.82428943448705e-05, "epoch": 7.68895800933126, "percentage": 61.8, "cur_time": "2024-08-20 09:09:22", "elapsed_time": "0:33:27", "remaining_time": "0:20:40", "throughput": "5606.94", "total_tokens": 11256752} +{"current_steps": 621, "total_steps": 1000, "loss": 0.9138, "learning_rate": 3.773463070605987e-05, "epoch": 7.726283048211508, "percentage": 62.1, "cur_time": "2024-08-20 09:09:31", "elapsed_time": "0:33:37", "remaining_time": "0:20:31", "throughput": "5607.56", "total_tokens": 11311200} +{"current_steps": 624, "total_steps": 1000, "loss": 0.9126, "learning_rate": 3.7227712103210486e-05, "epoch": 7.763608087091757, "percentage": 62.4, "cur_time": "2024-08-20 09:09:40", "elapsed_time": "0:33:46", "remaining_time": "0:20:20", "throughput": "5608.29", "total_tokens": 11362640} +{"current_steps": 627, "total_steps": 1000, "loss": 0.921, "learning_rate": 3.6722194125659556e-05, "epoch": 7.800933125972006, "percentage": 62.7, "cur_time": "2024-08-20 09:09:50", "elapsed_time": "0:33:56", "remaining_time": "0:20:11", "throughput": "5608.44", "total_tokens": 11420080} +{"current_steps": 630, "total_steps": 1000, "loss": 0.9167, "learning_rate": 3.6218132209150045e-05, "epoch": 7.838258164852255, "percentage": 63.0, "cur_time": "2024-08-20 09:09:59", "elapsed_time": "0:34:05", "remaining_time": "0:20:01", "throughput": "5609.00", "total_tokens": 11471056} +{"current_steps": 633, "total_steps": 1000, "loss": 1.0907, "learning_rate": 3.5715581629751326e-05, "epoch": 7.8755832037325035, "percentage": 63.3, "cur_time": "2024-08-20 09:10:09", "elapsed_time": "0:34:14", "remaining_time": "0:19:51", "throughput": "5610.00", "total_tokens": 11526928} +{"current_steps": 636, "total_steps": 1000, "loss": 0.9222, "learning_rate": 3.5214597497797684e-05, "epoch": 7.912908242612753, "percentage": 63.6, "cur_time": "2024-08-20 09:10:18", "elapsed_time": "0:34:24", "remaining_time": "0:19:41", "throughput": "5610.42", "total_tokens": 11580928} +{"current_steps": 639, "total_steps": 1000, "loss": 0.9267, "learning_rate": 3.471523475184472e-05, "epoch": 7.950233281493001, "percentage": 63.9, "cur_time": "2024-08-20 09:10:28", "elapsed_time": "0:34:33", "remaining_time": "0:19:31", "throughput": "5610.29", "total_tokens": 11634416} +{"current_steps": 642, "total_steps": 1000, "loss": 0.9974, "learning_rate": 3.4217548152644885e-05, "epoch": 7.9875583203732505, "percentage": 64.2, "cur_time": "2024-08-20 09:10:37", "elapsed_time": "0:34:42", "remaining_time": "0:19:21", "throughput": "5611.85", "total_tokens": 11688928} +{"current_steps": 645, "total_steps": 1000, "loss": 0.924, "learning_rate": 3.372159227714218e-05, "epoch": 8.024883359253499, "percentage": 64.5, "cur_time": "2024-08-20 09:10:47", "elapsed_time": "0:34:52", "remaining_time": "0:19:11", "throughput": "5611.58", "total_tokens": 11741968} +{"current_steps": 648, "total_steps": 1000, "loss": 0.7697, "learning_rate": 3.322742151248725e-05, "epoch": 8.062208398133748, "percentage": 64.8, "cur_time": "2024-08-20 09:10:56", "elapsed_time": "0:35:02", "remaining_time": "0:19:01", "throughput": "5610.78", "total_tokens": 11794432} +{"current_steps": 651, "total_steps": 1000, "loss": 0.9866, "learning_rate": 3.273509005007327e-05, "epoch": 8.099533437013998, "percentage": 65.1, "cur_time": "2024-08-20 09:11:05", "elapsed_time": "0:35:11", "remaining_time": "0:18:51", "throughput": "5612.54", "total_tokens": 11849744} +{"current_steps": 654, "total_steps": 1000, "loss": 0.9879, "learning_rate": 3.224465187959316e-05, "epoch": 8.136858475894245, "percentage": 65.4, "cur_time": "2024-08-20 09:11:14", "elapsed_time": "0:35:20", "remaining_time": "0:18:41", "throughput": "5614.54", "total_tokens": 11904800} +{"current_steps": 657, "total_steps": 1000, "loss": 0.8112, "learning_rate": 3.1756160783119016e-05, "epoch": 8.174183514774494, "percentage": 65.7, "cur_time": "2024-08-20 09:11:25", "elapsed_time": "0:35:30", "remaining_time": "0:18:32", "throughput": "5613.83", "total_tokens": 11960208} +{"current_steps": 660, "total_steps": 1000, "loss": 0.8722, "learning_rate": 3.12696703292044e-05, "epoch": 8.211508553654744, "percentage": 66.0, "cur_time": "2024-08-20 09:11:34", "elapsed_time": "0:35:39", "remaining_time": "0:18:22", "throughput": "5614.84", "total_tokens": 12012304} +{"current_steps": 663, "total_steps": 1000, "loss": 0.9897, "learning_rate": 3.078523386700982e-05, "epoch": 8.248833592534993, "percentage": 66.3, "cur_time": "2024-08-20 09:11:44", "elapsed_time": "0:35:49", "remaining_time": "0:18:12", "throughput": "5614.09", "total_tokens": 12067760} +{"current_steps": 666, "total_steps": 1000, "loss": 0.9071, "learning_rate": 3.0302904520452447e-05, "epoch": 8.28615863141524, "percentage": 66.6, "cur_time": "2024-08-20 09:11:55", "elapsed_time": "0:36:00", "remaining_time": "0:18:03", "throughput": "5612.96", "total_tokens": 12127248} +{"current_steps": 669, "total_steps": 1000, "loss": 0.8776, "learning_rate": 2.9822735182380496e-05, "epoch": 8.32348367029549, "percentage": 66.9, "cur_time": "2024-08-20 09:12:04", "elapsed_time": "0:36:10", "remaining_time": "0:17:53", "throughput": "5613.69", "total_tokens": 12183520} +{"current_steps": 672, "total_steps": 1000, "loss": 0.9587, "learning_rate": 2.934477850877292e-05, "epoch": 8.360808709175739, "percentage": 67.2, "cur_time": "2024-08-20 09:12:15", "elapsed_time": "0:36:20", "remaining_time": "0:17:44", "throughput": "5613.93", "total_tokens": 12240960} +{"current_steps": 675, "total_steps": 1000, "loss": 0.8807, "learning_rate": 2.886908691296504e-05, "epoch": 8.398133748055988, "percentage": 67.5, "cur_time": "2024-08-20 09:12:24", "elapsed_time": "0:36:29", "remaining_time": "0:17:34", "throughput": "5614.62", "total_tokens": 12295600} +{"current_steps": 678, "total_steps": 1000, "loss": 0.8717, "learning_rate": 2.8395712559900877e-05, "epoch": 8.435458786936236, "percentage": 67.8, "cur_time": "2024-08-20 09:12:33", "elapsed_time": "0:36:39", "remaining_time": "0:17:24", "throughput": "5615.19", "total_tokens": 12349072} +{"current_steps": 681, "total_steps": 1000, "loss": 0.8578, "learning_rate": 2.7924707360412746e-05, "epoch": 8.472783825816485, "percentage": 68.1, "cur_time": "2024-08-20 09:12:43", "elapsed_time": "0:36:49", "remaining_time": "0:17:14", "throughput": "5614.01", "total_tokens": 12403360} +{"current_steps": 684, "total_steps": 1000, "loss": 0.9193, "learning_rate": 2.7456122965528475e-05, "epoch": 8.510108864696734, "percentage": 68.4, "cur_time": "2024-08-20 09:12:54", "elapsed_time": "0:37:00", "remaining_time": "0:17:05", "throughput": "5613.17", "total_tokens": 12462624} +{"current_steps": 687, "total_steps": 1000, "loss": 0.909, "learning_rate": 2.699001076080742e-05, "epoch": 8.547433903576984, "percentage": 68.7, "cur_time": "2024-08-20 09:13:05", "elapsed_time": "0:37:10", "remaining_time": "0:16:56", "throughput": "5612.53", "total_tokens": 12519456} +{"current_steps": 690, "total_steps": 1000, "loss": 0.9335, "learning_rate": 2.6526421860705473e-05, "epoch": 8.584758942457231, "percentage": 69.0, "cur_time": "2024-08-20 09:13:14", "elapsed_time": "0:37:20", "remaining_time": "0:16:46", "throughput": "5612.90", "total_tokens": 12573888} +{"current_steps": 693, "total_steps": 1000, "loss": 0.8227, "learning_rate": 2.6065407102969664e-05, "epoch": 8.62208398133748, "percentage": 69.3, "cur_time": "2024-08-20 09:13:24", "elapsed_time": "0:37:29", "remaining_time": "0:16:36", "throughput": "5613.71", "total_tokens": 12627520} +{"current_steps": 696, "total_steps": 1000, "loss": 0.8731, "learning_rate": 2.560701704306336e-05, "epoch": 8.65940902021773, "percentage": 69.6, "cur_time": "2024-08-20 09:13:33", "elapsed_time": "0:37:39", "remaining_time": "0:16:26", "throughput": "5612.96", "total_tokens": 12680768} +{"current_steps": 699, "total_steps": 1000, "loss": 0.8538, "learning_rate": 2.5151301948622237e-05, "epoch": 8.696734059097977, "percentage": 69.9, "cur_time": "2024-08-20 09:13:43", "elapsed_time": "0:37:48", "remaining_time": "0:16:17", "throughput": "5612.78", "total_tokens": 12734640} +{"current_steps": 702, "total_steps": 1000, "loss": 0.8654, "learning_rate": 2.469831179394182e-05, "epoch": 8.734059097978227, "percentage": 70.2, "cur_time": "2024-08-20 09:13:52", "elapsed_time": "0:37:57", "remaining_time": "0:16:06", "throughput": "5614.83", "total_tokens": 12787200} +{"current_steps": 705, "total_steps": 1000, "loss": 0.8738, "learning_rate": 2.4248096254497288e-05, "epoch": 8.771384136858476, "percentage": 70.5, "cur_time": "2024-08-20 09:14:01", "elapsed_time": "0:38:06", "remaining_time": "0:15:56", "throughput": "5614.60", "total_tokens": 12840064} +{"current_steps": 708, "total_steps": 1000, "loss": 0.9721, "learning_rate": 2.3800704701496053e-05, "epoch": 8.808709175738725, "percentage": 70.8, "cur_time": "2024-08-20 09:14:11", "elapsed_time": "0:38:16", "remaining_time": "0:15:47", "throughput": "5614.58", "total_tokens": 12895776} +{"current_steps": 711, "total_steps": 1000, "loss": 0.9084, "learning_rate": 2.33561861964635e-05, "epoch": 8.846034214618973, "percentage": 71.1, "cur_time": "2024-08-20 09:14:20", "elapsed_time": "0:38:26", "remaining_time": "0:15:37", "throughput": "5613.98", "total_tokens": 12946496} +{"current_steps": 714, "total_steps": 1000, "loss": 0.8723, "learning_rate": 2.2914589485863014e-05, "epoch": 8.883359253499222, "percentage": 71.4, "cur_time": "2024-08-20 09:14:30", "elapsed_time": "0:38:35", "remaining_time": "0:15:27", "throughput": "5613.59", "total_tokens": 12999616} +{"current_steps": 717, "total_steps": 1000, "loss": 0.8646, "learning_rate": 2.247596299575022e-05, "epoch": 8.920684292379471, "percentage": 71.7, "cur_time": "2024-08-20 09:14:39", "elapsed_time": "0:38:45", "remaining_time": "0:15:17", "throughput": "5614.00", "total_tokens": 13053840} +{"current_steps": 720, "total_steps": 1000, "loss": 0.869, "learning_rate": 2.2040354826462668e-05, "epoch": 8.95800933125972, "percentage": 72.0, "cur_time": "2024-08-20 09:14:49", "elapsed_time": "0:38:54", "remaining_time": "0:15:08", "throughput": "5613.33", "total_tokens": 13107104} +{"current_steps": 723, "total_steps": 1000, "loss": 0.8733, "learning_rate": 2.160781274734495e-05, "epoch": 8.995334370139968, "percentage": 72.3, "cur_time": "2024-08-20 09:14:59", "elapsed_time": "0:39:05", "remaining_time": "0:14:58", "throughput": "5613.00", "total_tokens": 13162592} +{"current_steps": 726, "total_steps": 1000, "loss": 0.9076, "learning_rate": 2.117838419151034e-05, "epoch": 9.032659409020217, "percentage": 72.6, "cur_time": "2024-08-20 09:15:09", "elapsed_time": "0:39:14", "remaining_time": "0:14:48", "throughput": "5613.28", "total_tokens": 13217488} +{"current_steps": 729, "total_steps": 1000, "loss": 0.828, "learning_rate": 2.0752116250639225e-05, "epoch": 9.069984447900467, "percentage": 72.9, "cur_time": "2024-08-20 09:15:19", "elapsed_time": "0:39:24", "remaining_time": "0:14:39", "throughput": "5612.00", "total_tokens": 13270704} +{"current_steps": 732, "total_steps": 1000, "loss": 0.8922, "learning_rate": 2.0329055669814934e-05, "epoch": 9.107309486780716, "percentage": 73.2, "cur_time": "2024-08-20 09:15:28", "elapsed_time": "0:39:33", "remaining_time": "0:14:29", "throughput": "5613.63", "total_tokens": 13325168} +{"current_steps": 735, "total_steps": 1000, "loss": 0.8684, "learning_rate": 1.9909248842397584e-05, "epoch": 9.144634525660964, "percentage": 73.5, "cur_time": "2024-08-20 09:15:39", "elapsed_time": "0:39:45", "remaining_time": "0:14:19", "throughput": "5612.25", "total_tokens": 13385488} +{"current_steps": 738, "total_steps": 1000, "loss": 0.8703, "learning_rate": 1.9492741804936622e-05, "epoch": 9.181959564541213, "percentage": 73.8, "cur_time": "2024-08-20 09:15:49", "elapsed_time": "0:39:54", "remaining_time": "0:14:10", "throughput": "5612.80", "total_tokens": 13439872} +{"current_steps": 741, "total_steps": 1000, "loss": 0.8381, "learning_rate": 1.9079580232122303e-05, "epoch": 9.219284603421462, "percentage": 74.1, "cur_time": "2024-08-20 09:15:59", "elapsed_time": "0:40:04", "remaining_time": "0:14:00", "throughput": "5612.27", "total_tokens": 13494080} +{"current_steps": 744, "total_steps": 1000, "loss": 0.9077, "learning_rate": 1.866980943177699e-05, "epoch": 9.256609642301711, "percentage": 74.4, "cur_time": "2024-08-20 09:16:09", "elapsed_time": "0:40:14", "remaining_time": "0:13:50", "throughput": "5612.62", "total_tokens": 13552208} +{"current_steps": 747, "total_steps": 1000, "loss": 0.8887, "learning_rate": 1.8263474339886628e-05, "epoch": 9.293934681181959, "percentage": 74.7, "cur_time": "2024-08-20 09:16:18", "elapsed_time": "0:40:24", "remaining_time": "0:13:41", "throughput": "5613.57", "total_tokens": 13608832} +{"current_steps": 750, "total_steps": 1000, "loss": 0.8205, "learning_rate": 1.7860619515673033e-05, "epoch": 9.331259720062208, "percentage": 75.0, "cur_time": "2024-08-20 09:16:28", "elapsed_time": "0:40:34", "remaining_time": "0:13:31", "throughput": "5612.97", "total_tokens": 13663760} +{"current_steps": 753, "total_steps": 1000, "loss": 0.8274, "learning_rate": 1.746128913670746e-05, "epoch": 9.368584758942458, "percentage": 75.3, "cur_time": "2024-08-20 09:16:38", "elapsed_time": "0:40:43", "remaining_time": "0:13:21", "throughput": "5612.73", "total_tokens": 13716800} +{"current_steps": 756, "total_steps": 1000, "loss": 0.9119, "learning_rate": 1.7065526994065973e-05, "epoch": 9.405909797822707, "percentage": 75.6, "cur_time": "2024-08-20 09:16:48", "elapsed_time": "0:40:53", "remaining_time": "0:13:12", "throughput": "5613.32", "total_tokens": 13774816} +{"current_steps": 759, "total_steps": 1000, "loss": 0.8099, "learning_rate": 1.667337648752738e-05, "epoch": 9.443234836702954, "percentage": 75.9, "cur_time": "2024-08-20 09:16:59", "elapsed_time": "0:41:04", "remaining_time": "0:13:02", "throughput": "5611.83", "total_tokens": 13830048} +{"current_steps": 762, "total_steps": 1000, "loss": 0.8317, "learning_rate": 1.6284880620813848e-05, "epoch": 9.480559875583204, "percentage": 76.2, "cur_time": "2024-08-20 09:17:08", "elapsed_time": "0:41:13", "remaining_time": "0:12:52", "throughput": "5613.58", "total_tokens": 13885008} +{"current_steps": 765, "total_steps": 1000, "loss": 0.7895, "learning_rate": 1.5900081996875083e-05, "epoch": 9.517884914463453, "percentage": 76.5, "cur_time": "2024-08-20 09:17:18", "elapsed_time": "0:41:23", "remaining_time": "0:12:42", "throughput": "5612.36", "total_tokens": 13939504} +{"current_steps": 768, "total_steps": 1000, "loss": 0.8474, "learning_rate": 1.551902281321651e-05, "epoch": 9.555209953343702, "percentage": 76.8, "cur_time": "2024-08-20 09:17:27", "elapsed_time": "0:41:32", "remaining_time": "0:12:33", "throughput": "5613.35", "total_tokens": 13992352} +{"current_steps": 771, "total_steps": 1000, "loss": 0.8418, "learning_rate": 1.5141744857271778e-05, "epoch": 9.59253499222395, "percentage": 77.1, "cur_time": "2024-08-20 09:17:37", "elapsed_time": "0:41:42", "remaining_time": "0:12:23", "throughput": "5612.15", "total_tokens": 14044656} +{"current_steps": 774, "total_steps": 1000, "loss": 0.7605, "learning_rate": 1.4768289501820265e-05, "epoch": 9.629860031104199, "percentage": 77.4, "cur_time": "2024-08-20 09:17:46", "elapsed_time": "0:41:51", "remaining_time": "0:12:13", "throughput": "5611.34", "total_tokens": 14095664} +{"current_steps": 777, "total_steps": 1000, "loss": 0.8928, "learning_rate": 1.439869770045018e-05, "epoch": 9.667185069984448, "percentage": 77.7, "cur_time": "2024-08-20 09:17:56", "elapsed_time": "0:42:01", "remaining_time": "0:12:03", "throughput": "5612.28", "total_tokens": 14151808} +{"current_steps": 780, "total_steps": 1000, "loss": 0.7745, "learning_rate": 1.4033009983067452e-05, "epoch": 9.704510108864696, "percentage": 78.0, "cur_time": "2024-08-20 09:18:05", "elapsed_time": "0:42:10", "remaining_time": "0:11:53", "throughput": "5612.54", "total_tokens": 14202128} +{"current_steps": 783, "total_steps": 1000, "loss": 0.8861, "learning_rate": 1.367126645145121e-05, "epoch": 9.741835147744945, "percentage": 78.3, "cur_time": "2024-08-20 09:18:13", "elapsed_time": "0:42:18", "remaining_time": "0:11:43", "throughput": "5614.17", "total_tokens": 14254320} +{"current_steps": 786, "total_steps": 1000, "loss": 0.7954, "learning_rate": 1.3313506774856177e-05, "epoch": 9.779160186625194, "percentage": 78.6, "cur_time": "2024-08-20 09:18:22", "elapsed_time": "0:42:27", "remaining_time": "0:11:33", "throughput": "5615.20", "total_tokens": 14307104} +{"current_steps": 789, "total_steps": 1000, "loss": 0.8982, "learning_rate": 1.29597701856625e-05, "epoch": 9.816485225505444, "percentage": 78.9, "cur_time": "2024-08-20 09:18:33", "elapsed_time": "0:42:38", "remaining_time": "0:11:24", "throughput": "5614.64", "total_tokens": 14364592} +{"current_steps": 792, "total_steps": 1000, "loss": 0.8608, "learning_rate": 1.2610095475073414e-05, "epoch": 9.853810264385691, "percentage": 79.2, "cur_time": "2024-08-20 09:18:42", "elapsed_time": "0:42:47", "remaining_time": "0:11:14", "throughput": "5614.99", "total_tokens": 14417696} +{"current_steps": 795, "total_steps": 1000, "loss": 0.8443, "learning_rate": 1.22645209888614e-05, "epoch": 9.89113530326594, "percentage": 79.5, "cur_time": "2024-08-20 09:18:52", "elapsed_time": "0:42:57", "remaining_time": "0:11:04", "throughput": "5615.03", "total_tokens": 14472720} +{"current_steps": 798, "total_steps": 1000, "loss": 0.8242, "learning_rate": 1.1923084623163172e-05, "epoch": 9.92846034214619, "percentage": 79.8, "cur_time": "2024-08-20 09:19:01", "elapsed_time": "0:43:06", "remaining_time": "0:10:54", "throughput": "5615.21", "total_tokens": 14526336} +{"current_steps": 801, "total_steps": 1000, "loss": 0.7819, "learning_rate": 1.1585823820323843e-05, "epoch": 9.96578538102644, "percentage": 80.1, "cur_time": "2024-08-20 09:19:10", "elapsed_time": "0:43:15", "remaining_time": "0:10:44", "throughput": "5615.41", "total_tokens": 14577424} +{"current_steps": 804, "total_steps": 1000, "loss": 0.8295, "learning_rate": 1.1252775564791024e-05, "epoch": 10.003110419906687, "percentage": 80.4, "cur_time": "2024-08-20 09:19:20", "elapsed_time": "0:43:26", "remaining_time": "0:10:35", "throughput": "5614.82", "total_tokens": 14633552} +{"current_steps": 807, "total_steps": 1000, "loss": 0.8614, "learning_rate": 1.0923976379059058e-05, "epoch": 10.040435458786936, "percentage": 80.7, "cur_time": "2024-08-20 09:19:31", "elapsed_time": "0:43:37", "remaining_time": "0:10:25", "throughput": "5613.17", "total_tokens": 14690976} +{"current_steps": 810, "total_steps": 1000, "loss": 0.7815, "learning_rate": 1.0599462319663905e-05, "epoch": 10.077760497667185, "percentage": 81.0, "cur_time": "2024-08-20 09:19:41", "elapsed_time": "0:43:46", "remaining_time": "0:10:16", "throughput": "5613.01", "total_tokens": 14742128} +{"current_steps": 813, "total_steps": 1000, "loss": 0.7795, "learning_rate": 1.0279268973229089e-05, "epoch": 10.115085536547435, "percentage": 81.3, "cur_time": "2024-08-20 09:19:50", "elapsed_time": "0:43:56", "remaining_time": "0:10:06", "throughput": "5612.05", "total_tokens": 14794288} +{"current_steps": 816, "total_steps": 1000, "loss": 0.7585, "learning_rate": 9.963431452563332e-06, "epoch": 10.152410575427682, "percentage": 81.6, "cur_time": "2024-08-20 09:20:01", "elapsed_time": "0:44:06", "remaining_time": "0:09:56", "throughput": "5609.70", "total_tokens": 14846560} +{"current_steps": 819, "total_steps": 1000, "loss": 0.7785, "learning_rate": 9.651984392809914e-06, "epoch": 10.189735614307931, "percentage": 81.9, "cur_time": "2024-08-20 09:20:10", "elapsed_time": "0:44:15", "remaining_time": "0:09:46", "throughput": "5610.24", "total_tokens": 14900528} +{"current_steps": 822, "total_steps": 1000, "loss": 0.848, "learning_rate": 9.344961947648623e-06, "epoch": 10.22706065318818, "percentage": 82.2, "cur_time": "2024-08-20 09:20:20", "elapsed_time": "0:44:25", "remaining_time": "0:09:37", "throughput": "5610.71", "total_tokens": 14956624} +{"current_steps": 825, "total_steps": 1000, "loss": 0.8245, "learning_rate": 9.042397785550405e-06, "epoch": 10.26438569206843, "percentage": 82.5, "cur_time": "2024-08-20 09:20:30", "elapsed_time": "0:44:35", "remaining_time": "0:09:27", "throughput": "5611.71", "total_tokens": 15013488} +{"current_steps": 828, "total_steps": 1000, "loss": 0.811, "learning_rate": 8.744325086085248e-06, "epoch": 10.301710730948678, "percentage": 82.8, "cur_time": "2024-08-20 09:20:39", "elapsed_time": "0:44:44", "remaining_time": "0:09:17", "throughput": "5612.85", "total_tokens": 15068000} +{"current_steps": 831, "total_steps": 1000, "loss": 0.8597, "learning_rate": 8.450776536283594e-06, "epoch": 10.339035769828927, "percentage": 83.1, "cur_time": "2024-08-20 09:20:49", "elapsed_time": "0:44:54", "remaining_time": "0:09:07", "throughput": "5613.05", "total_tokens": 15124672} +{"current_steps": 834, "total_steps": 1000, "loss": 0.7463, "learning_rate": 8.16178432705192e-06, "epoch": 10.376360808709176, "percentage": 83.4, "cur_time": "2024-08-20 09:20:58", "elapsed_time": "0:45:03", "remaining_time": "0:08:58", "throughput": "5613.26", "total_tokens": 15174960} +{"current_steps": 837, "total_steps": 1000, "loss": 0.7385, "learning_rate": 7.877380149642626e-06, "epoch": 10.413685847589425, "percentage": 83.7, "cur_time": "2024-08-20 09:21:07", "elapsed_time": "0:45:12", "remaining_time": "0:08:48", "throughput": "5613.65", "total_tokens": 15228192} +{"current_steps": 840, "total_steps": 1000, "loss": 0.8362, "learning_rate": 7.597595192178702e-06, "epoch": 10.451010886469673, "percentage": 84.0, "cur_time": "2024-08-20 09:21:16", "elapsed_time": "0:45:21", "remaining_time": "0:08:38", "throughput": "5614.64", "total_tokens": 15282256} +{"current_steps": 843, "total_steps": 1000, "loss": 0.7866, "learning_rate": 7.322460136233622e-06, "epoch": 10.488335925349922, "percentage": 84.3, "cur_time": "2024-08-20 09:21:25", "elapsed_time": "0:45:31", "remaining_time": "0:08:28", "throughput": "5615.42", "total_tokens": 15337392} +{"current_steps": 846, "total_steps": 1000, "loss": 0.8247, "learning_rate": 7.052005153466779e-06, "epoch": 10.525660964230172, "percentage": 84.6, "cur_time": "2024-08-20 09:21:35", "elapsed_time": "0:45:41", "remaining_time": "0:08:18", "throughput": "5615.48", "total_tokens": 15392448} +{"current_steps": 849, "total_steps": 1000, "loss": 0.8383, "learning_rate": 6.786259902314768e-06, "epoch": 10.56298600311042, "percentage": 84.9, "cur_time": "2024-08-20 09:21:45", "elapsed_time": "0:45:50", "remaining_time": "0:08:09", "throughput": "5615.23", "total_tokens": 15446672} +{"current_steps": 852, "total_steps": 1000, "loss": 0.7268, "learning_rate": 6.52525352473905e-06, "epoch": 10.600311041990668, "percentage": 85.2, "cur_time": "2024-08-20 09:21:54", "elapsed_time": "0:45:59", "remaining_time": "0:07:59", "throughput": "5614.53", "total_tokens": 15496064} +{"current_steps": 855, "total_steps": 1000, "loss": 0.8733, "learning_rate": 6.269014643030213e-06, "epoch": 10.637636080870918, "percentage": 85.5, "cur_time": "2024-08-20 09:22:04", "elapsed_time": "0:46:09", "remaining_time": "0:07:49", "throughput": "5614.61", "total_tokens": 15549952} +{"current_steps": 858, "total_steps": 1000, "loss": 0.8492, "learning_rate": 6.017571356669183e-06, "epoch": 10.674961119751167, "percentage": 85.8, "cur_time": "2024-08-20 09:22:14", "elapsed_time": "0:46:20", "remaining_time": "0:07:40", "throughput": "5614.02", "total_tokens": 15607920} +{"current_steps": 861, "total_steps": 1000, "loss": 0.8154, "learning_rate": 5.770951239245803e-06, "epoch": 10.712286158631414, "percentage": 86.1, "cur_time": "2024-08-20 09:22:24", "elapsed_time": "0:46:29", "remaining_time": "0:07:30", "throughput": "5615.07", "total_tokens": 15663280} +{"current_steps": 864, "total_steps": 1000, "loss": 0.7953, "learning_rate": 5.529181335435124e-06, "epoch": 10.749611197511664, "percentage": 86.4, "cur_time": "2024-08-20 09:22:33", "elapsed_time": "0:46:39", "remaining_time": "0:07:20", "throughput": "5615.35", "total_tokens": 15718496} +{"current_steps": 867, "total_steps": 1000, "loss": 0.8559, "learning_rate": 5.292288158031594e-06, "epoch": 10.786936236391913, "percentage": 86.7, "cur_time": "2024-08-20 09:22:43", "elapsed_time": "0:46:48", "remaining_time": "0:07:10", "throughput": "5615.58", "total_tokens": 15772560} +{"current_steps": 870, "total_steps": 1000, "loss": 0.8111, "learning_rate": 5.060297685041659e-06, "epoch": 10.824261275272162, "percentage": 87.0, "cur_time": "2024-08-20 09:22:52", "elapsed_time": "0:46:58", "remaining_time": "0:07:01", "throughput": "5616.41", "total_tokens": 15828464} +{"current_steps": 873, "total_steps": 1000, "loss": 0.8946, "learning_rate": 4.833235356834959e-06, "epoch": 10.86158631415241, "percentage": 87.3, "cur_time": "2024-08-20 09:23:03", "elapsed_time": "0:47:08", "remaining_time": "0:06:51", "throughput": "5615.82", "total_tokens": 15886160} +{"current_steps": 876, "total_steps": 1000, "loss": 0.831, "learning_rate": 4.611126073354571e-06, "epoch": 10.89891135303266, "percentage": 87.6, "cur_time": "2024-08-20 09:23:12", "elapsed_time": "0:47:17", "remaining_time": "0:06:41", "throughput": "5616.42", "total_tokens": 15938720} +{"current_steps": 879, "total_steps": 1000, "loss": 0.8318, "learning_rate": 4.3939941913863525e-06, "epoch": 10.936236391912908, "percentage": 87.9, "cur_time": "2024-08-20 09:23:21", "elapsed_time": "0:47:27", "remaining_time": "0:06:31", "throughput": "5617.15", "total_tokens": 15992720} +{"current_steps": 882, "total_steps": 1000, "loss": 0.7655, "learning_rate": 4.181863521888019e-06, "epoch": 10.973561430793158, "percentage": 88.2, "cur_time": "2024-08-20 09:23:31", "elapsed_time": "0:47:36", "remaining_time": "0:06:22", "throughput": "5617.96", "total_tokens": 16049584} +{"current_steps": 885, "total_steps": 1000, "loss": 0.8017, "learning_rate": 3.974757327377981e-06, "epoch": 11.010886469673405, "percentage": 88.5, "cur_time": "2024-08-20 09:23:40", "elapsed_time": "0:47:45", "remaining_time": "0:06:12", "throughput": "5619.03", "total_tokens": 16103840} +{"current_steps": 888, "total_steps": 1000, "loss": 0.8806, "learning_rate": 3.772698319384349e-06, "epoch": 11.048211508553655, "percentage": 88.8, "cur_time": "2024-08-20 09:23:50", "elapsed_time": "0:47:55", "remaining_time": "0:06:02", "throughput": "5619.95", "total_tokens": 16160880} +{"current_steps": 891, "total_steps": 1000, "loss": 0.7852, "learning_rate": 3.575708655954324e-06, "epoch": 11.085536547433904, "percentage": 89.1, "cur_time": "2024-08-20 09:23:59", "elapsed_time": "0:48:04", "remaining_time": "0:05:52", "throughput": "5620.44", "total_tokens": 16213280} +{"current_steps": 894, "total_steps": 1000, "loss": 0.7314, "learning_rate": 3.3838099392243916e-06, "epoch": 11.122861586314153, "percentage": 89.4, "cur_time": "2024-08-20 09:24:08", "elapsed_time": "0:48:13", "remaining_time": "0:05:43", "throughput": "5620.95", "total_tokens": 16265488} +{"current_steps": 897, "total_steps": 1000, "loss": 0.809, "learning_rate": 3.197023213051337e-06, "epoch": 11.1601866251944, "percentage": 89.7, "cur_time": "2024-08-20 09:24:17", "elapsed_time": "0:48:23", "remaining_time": "0:05:33", "throughput": "5621.58", "total_tokens": 16321040} +{"current_steps": 900, "total_steps": 1000, "loss": 0.7076, "learning_rate": 3.0153689607045845e-06, "epoch": 11.19751166407465, "percentage": 90.0, "cur_time": "2024-08-20 09:24:27", "elapsed_time": "0:48:32", "remaining_time": "0:05:23", "throughput": "5621.45", "total_tokens": 16372560} +{"current_steps": 903, "total_steps": 1000, "loss": 0.8021, "learning_rate": 2.8388671026199522e-06, "epoch": 11.2348367029549, "percentage": 90.3, "cur_time": "2024-08-20 09:24:36", "elapsed_time": "0:48:42", "remaining_time": "0:05:13", "throughput": "5621.44", "total_tokens": 16425856} +{"current_steps": 906, "total_steps": 1000, "loss": 0.7633, "learning_rate": 2.667536994215186e-06, "epoch": 11.272161741835149, "percentage": 90.6, "cur_time": "2024-08-20 09:24:46", "elapsed_time": "0:48:51", "remaining_time": "0:05:04", "throughput": "5621.79", "total_tokens": 16480176} +{"current_steps": 909, "total_steps": 1000, "loss": 0.7514, "learning_rate": 2.501397423767382e-06, "epoch": 11.309486780715396, "percentage": 90.9, "cur_time": "2024-08-20 09:24:54", "elapsed_time": "0:49:00", "remaining_time": "0:04:54", "throughput": "5622.41", "total_tokens": 16531248} +{"current_steps": 912, "total_steps": 1000, "loss": 0.9144, "learning_rate": 2.340466610352654e-06, "epoch": 11.346811819595645, "percentage": 91.2, "cur_time": "2024-08-20 09:25:05", "elapsed_time": "0:49:10", "remaining_time": "0:04:44", "throughput": "5622.95", "total_tokens": 16589952} +{"current_steps": 915, "total_steps": 1000, "loss": 0.7871, "learning_rate": 2.1847622018482283e-06, "epoch": 11.384136858475895, "percentage": 91.5, "cur_time": "2024-08-20 09:25:14", "elapsed_time": "0:49:19", "remaining_time": "0:04:34", "throughput": "5622.70", "total_tokens": 16642688} +{"current_steps": 918, "total_steps": 1000, "loss": 0.8018, "learning_rate": 2.0343012729971243e-06, "epoch": 11.421461897356144, "percentage": 91.8, "cur_time": "2024-08-20 09:25:24", "elapsed_time": "0:49:29", "remaining_time": "0:04:25", "throughput": "5622.95", "total_tokens": 16698016} +{"current_steps": 921, "total_steps": 1000, "loss": 0.7833, "learning_rate": 1.8891003235357308e-06, "epoch": 11.458786936236391, "percentage": 92.1, "cur_time": "2024-08-20 09:25:32", "elapsed_time": "0:49:38", "remaining_time": "0:04:15", "throughput": "5624.70", "total_tokens": 16751600} +{"current_steps": 924, "total_steps": 1000, "loss": 0.8172, "learning_rate": 1.7491752763844293e-06, "epoch": 11.49611197511664, "percentage": 92.4, "cur_time": "2024-08-20 09:25:42", "elapsed_time": "0:49:48", "remaining_time": "0:04:05", "throughput": "5624.79", "total_tokens": 16807424} +{"current_steps": 927, "total_steps": 1000, "loss": 0.7801, "learning_rate": 1.6145414759014431e-06, "epoch": 11.53343701399689, "percentage": 92.7, "cur_time": "2024-08-20 09:25:53", "elapsed_time": "0:49:59", "remaining_time": "0:03:56", "throughput": "5622.24", "total_tokens": 16863040} +{"current_steps": 930, "total_steps": 1000, "loss": 0.8497, "learning_rate": 1.4852136862001764e-06, "epoch": 11.57076205287714, "percentage": 93.0, "cur_time": "2024-08-20 09:26:04", "elapsed_time": "0:50:10", "remaining_time": "0:03:46", "throughput": "5620.82", "total_tokens": 16919056} +{"current_steps": 933, "total_steps": 1000, "loss": 0.865, "learning_rate": 1.3612060895301759e-06, "epoch": 11.608087091757387, "percentage": 93.3, "cur_time": "2024-08-20 09:26:15", "elapsed_time": "0:50:20", "remaining_time": "0:03:36", "throughput": "5620.69", "total_tokens": 16976880} +{"current_steps": 936, "total_steps": 1000, "loss": 0.6891, "learning_rate": 1.2425322847218368e-06, "epoch": 11.645412130637636, "percentage": 93.6, "cur_time": "2024-08-20 09:26:24", "elapsed_time": "0:50:29", "remaining_time": "0:03:27", "throughput": "5620.93", "total_tokens": 17028016} +{"current_steps": 939, "total_steps": 1000, "loss": 0.8017, "learning_rate": 1.1292052856952062e-06, "epoch": 11.682737169517885, "percentage": 93.9, "cur_time": "2024-08-20 09:26:34", "elapsed_time": "0:50:40", "remaining_time": "0:03:17", "throughput": "5619.72", "total_tokens": 17084832} +{"current_steps": 942, "total_steps": 1000, "loss": 0.8494, "learning_rate": 1.0212375200327973e-06, "epoch": 11.720062208398133, "percentage": 94.2, "cur_time": "2024-08-20 09:26:44", "elapsed_time": "0:50:49", "remaining_time": "0:03:07", "throughput": "5621.02", "total_tokens": 17141568} +{"current_steps": 945, "total_steps": 1000, "loss": 0.7922, "learning_rate": 9.186408276168013e-07, "epoch": 11.757387247278382, "percentage": 94.5, "cur_time": "2024-08-20 09:26:54", "elapsed_time": "0:51:00", "remaining_time": "0:02:58", "throughput": "5620.04", "total_tokens": 17197456} +{"current_steps": 948, "total_steps": 1000, "loss": 0.7013, "learning_rate": 8.214264593307098e-07, "epoch": 11.794712286158632, "percentage": 94.8, "cur_time": "2024-08-20 09:27:03", "elapsed_time": "0:51:08", "remaining_time": "0:02:48", "throughput": "5620.18", "total_tokens": 17246704} +{"current_steps": 951, "total_steps": 1000, "loss": 0.788, "learning_rate": 7.296050758254957e-07, "epoch": 11.83203732503888, "percentage": 95.1, "cur_time": "2024-08-20 09:27:12", "elapsed_time": "0:51:18", "remaining_time": "0:02:38", "throughput": "5620.45", "total_tokens": 17301040} +{"current_steps": 954, "total_steps": 1000, "loss": 0.7436, "learning_rate": 6.431867463506048e-07, "epoch": 11.869362363919128, "percentage": 95.4, "cur_time": "2024-08-20 09:27:22", "elapsed_time": "0:51:27", "remaining_time": "0:02:28", "throughput": "5620.65", "total_tokens": 17353760} +{"current_steps": 957, "total_steps": 1000, "loss": 0.783, "learning_rate": 5.621809476497098e-07, "epoch": 11.906687402799378, "percentage": 95.7, "cur_time": "2024-08-20 09:27:32", "elapsed_time": "0:51:37", "remaining_time": "0:02:19", "throughput": "5619.76", "total_tokens": 17408864} +{"current_steps": 960, "total_steps": 1000, "loss": 0.8306, "learning_rate": 4.865965629214819e-07, "epoch": 11.944012441679627, "percentage": 96.0, "cur_time": "2024-08-20 09:27:41", "elapsed_time": "0:51:47", "remaining_time": "0:02:09", "throughput": "5620.69", "total_tokens": 17463616} +{"current_steps": 963, "total_steps": 1000, "loss": 0.9427, "learning_rate": 4.1644188084548063e-07, "epoch": 11.981337480559876, "percentage": 96.3, "cur_time": "2024-08-20 09:27:51", "elapsed_time": "0:51:56", "remaining_time": "0:01:59", "throughput": "5621.67", "total_tokens": 17521440} +{"current_steps": 966, "total_steps": 1000, "loss": 0.8395, "learning_rate": 3.517245946731529e-07, "epoch": 12.018662519440124, "percentage": 96.6, "cur_time": "2024-08-20 09:28:01", "elapsed_time": "0:52:06", "remaining_time": "0:01:50", "throughput": "5622.25", "total_tokens": 17578672} +{"current_steps": 969, "total_steps": 1000, "loss": 0.8069, "learning_rate": 2.924518013842303e-07, "epoch": 12.055987558320373, "percentage": 96.9, "cur_time": "2024-08-20 09:28:11", "elapsed_time": "0:52:17", "remaining_time": "0:01:40", "throughput": "5621.59", "total_tokens": 17635856} +{"current_steps": 972, "total_steps": 1000, "loss": 0.7471, "learning_rate": 2.386300009084408e-07, "epoch": 12.093312597200622, "percentage": 97.2, "cur_time": "2024-08-20 09:28:20", "elapsed_time": "0:52:26", "remaining_time": "0:01:30", "throughput": "5622.15", "total_tokens": 17689072} +{"current_steps": 975, "total_steps": 1000, "loss": 0.8251, "learning_rate": 1.9026509541272275e-07, "epoch": 12.130637636080872, "percentage": 97.5, "cur_time": "2024-08-20 09:28:31", "elapsed_time": "0:52:36", "remaining_time": "0:01:20", "throughput": "5622.38", "total_tokens": 17746800} +{"current_steps": 978, "total_steps": 1000, "loss": 0.7807, "learning_rate": 1.4736238865398765e-07, "epoch": 12.16796267496112, "percentage": 97.8, "cur_time": "2024-08-20 09:28:39", "elapsed_time": "0:52:44", "remaining_time": "0:01:11", "throughput": "5623.98", "total_tokens": 17799264} +{"current_steps": 981, "total_steps": 1000, "loss": 0.8495, "learning_rate": 1.0992658539750178e-07, "epoch": 12.205287713841368, "percentage": 98.1, "cur_time": "2024-08-20 09:28:50", "elapsed_time": "0:52:55", "remaining_time": "0:01:01", "throughput": "5623.28", "total_tokens": 17857296} +{"current_steps": 984, "total_steps": 1000, "loss": 0.7786, "learning_rate": 7.796179090094891e-08, "epoch": 12.242612752721618, "percentage": 98.4, "cur_time": "2024-08-20 09:29:00", "elapsed_time": "0:53:05", "remaining_time": "0:00:51", "throughput": "5622.67", "total_tokens": 17911456} +{"current_steps": 987, "total_steps": 1000, "loss": 0.7214, "learning_rate": 5.1471510464268236e-08, "epoch": 12.279937791601867, "percentage": 98.7, "cur_time": "2024-08-20 09:29:08", "elapsed_time": "0:53:14", "remaining_time": "0:00:42", "throughput": "5623.24", "total_tokens": 17960800} +{"current_steps": 990, "total_steps": 1000, "loss": 0.7353, "learning_rate": 3.04586490452119e-08, "epoch": 12.317262830482115, "percentage": 99.0, "cur_time": "2024-08-20 09:29:18", "elapsed_time": "0:53:24", "remaining_time": "0:00:32", "throughput": "5622.00", "total_tokens": 18014816} +{"current_steps": 993, "total_steps": 1000, "loss": 0.7827, "learning_rate": 1.4925510940844156e-08, "epoch": 12.354587869362364, "percentage": 99.3, "cur_time": "2024-08-20 09:29:28", "elapsed_time": "0:53:33", "remaining_time": "0:00:22", "throughput": "5622.07", "total_tokens": 18069152} +{"current_steps": 996, "total_steps": 1000, "loss": 0.871, "learning_rate": 4.873799534788059e-09, "epoch": 12.391912908242613, "percentage": 99.6, "cur_time": "2024-08-20 09:29:38", "elapsed_time": "0:53:43", "remaining_time": "0:00:12", "throughput": "5622.08", "total_tokens": 18124176} +{"current_steps": 999, "total_steps": 1000, "loss": 0.8343, "learning_rate": 3.0461711048035415e-10, "epoch": 12.42923794712286, "percentage": 99.9, "cur_time": "2024-08-20 09:29:47", "elapsed_time": "0:53:53", "remaining_time": "0:00:03", "throughput": "5622.28", "total_tokens": 18177392} +{"current_steps": 1000, "total_steps": 1000, "eval_loss": 1.8893193006515503, "epoch": 12.441679626749611, "percentage": 100.0, "cur_time": "2024-08-20 09:30:00", "elapsed_time": "0:54:05", "remaining_time": "0:00:00", "throughput": "5605.26", "total_tokens": 18194112} +{"current_steps": 1000, "total_steps": 1000, "epoch": 12.441679626749611, "percentage": 100.0, "cur_time": "2024-08-20 09:30:00", "elapsed_time": "0:54:06", "remaining_time": "0:00:00", "throughput": "5604.93", "total_tokens": 18194112} diff --git a/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/trainer_state.json b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/trainer_state.json new file mode 100644 index 00000000..3a294cd1 --- /dev/null +++ b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/trainer_state.json @@ -0,0 +1,2725 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 12.441679626749611, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03732503888024884, + "grad_norm": 0.45354562997817993, + "learning_rate": 3e-06, + "loss": 2.6491, + "num_input_tokens_seen": 59360, + "step": 3 + }, + { + "epoch": 0.07465007776049767, + "grad_norm": 0.492897093296051, + "learning_rate": 6e-06, + "loss": 2.714, + "num_input_tokens_seen": 110224, + "step": 6 + }, + { + "epoch": 0.1119751166407465, + "grad_norm": 0.4439088702201843, + "learning_rate": 9e-06, + "loss": 2.8832, + "num_input_tokens_seen": 162416, + "step": 9 + }, + { + "epoch": 0.14930015552099535, + "grad_norm": 0.4829657971858978, + "learning_rate": 1.2e-05, + "loss": 2.8293, + "num_input_tokens_seen": 215648, + "step": 12 + }, + { + "epoch": 0.18662519440124417, + "grad_norm": 0.5115702748298645, + "learning_rate": 1.5e-05, + "loss": 2.7404, + "num_input_tokens_seen": 270528, + "step": 15 + }, + { + "epoch": 0.223950233281493, + "grad_norm": 0.5357673168182373, + "learning_rate": 1.8e-05, + "loss": 2.7317, + "num_input_tokens_seen": 320944, + "step": 18 + }, + { + "epoch": 0.26127527216174184, + "grad_norm": 0.4877207577228546, + "learning_rate": 2.1e-05, + "loss": 2.7431, + "num_input_tokens_seen": 377696, + "step": 21 + }, + { + "epoch": 0.2986003110419907, + "grad_norm": 0.5688062310218811, + "learning_rate": 2.4e-05, + "loss": 2.5594, + "num_input_tokens_seen": 428048, + "step": 24 + }, + { + "epoch": 0.3359253499222395, + "grad_norm": 0.5852718949317932, + "learning_rate": 2.7000000000000002e-05, + "loss": 2.7774, + "num_input_tokens_seen": 485680, + "step": 27 + }, + { + "epoch": 0.37325038880248834, + "grad_norm": 0.626228928565979, + "learning_rate": 3e-05, + "loss": 2.7325, + "num_input_tokens_seen": 538704, + "step": 30 + }, + { + "epoch": 0.4105754276827372, + "grad_norm": 0.7254530787467957, + "learning_rate": 3.3e-05, + "loss": 2.6304, + "num_input_tokens_seen": 590112, + "step": 33 + }, + { + "epoch": 0.447900466562986, + "grad_norm": 0.7166836261749268, + "learning_rate": 3.6e-05, + "loss": 2.6643, + "num_input_tokens_seen": 645360, + "step": 36 + }, + { + "epoch": 0.48522550544323484, + "grad_norm": 0.8263206481933594, + "learning_rate": 3.9000000000000006e-05, + "loss": 2.5223, + "num_input_tokens_seen": 699664, + "step": 39 + }, + { + "epoch": 0.5225505443234837, + "grad_norm": 1.0774929523468018, + "learning_rate": 4.2e-05, + "loss": 2.3975, + "num_input_tokens_seen": 752640, + "step": 42 + }, + { + "epoch": 0.5598755832037325, + "grad_norm": 1.4689879417419434, + "learning_rate": 4.5e-05, + "loss": 2.3851, + "num_input_tokens_seen": 809488, + "step": 45 + }, + { + "epoch": 0.5972006220839814, + "grad_norm": 1.6804075241088867, + "learning_rate": 4.8e-05, + "loss": 2.1509, + "num_input_tokens_seen": 866016, + "step": 48 + }, + { + "epoch": 0.6345256609642301, + "grad_norm": 1.5407774448394775, + "learning_rate": 5.1000000000000006e-05, + "loss": 1.9498, + "num_input_tokens_seen": 922160, + "step": 51 + }, + { + "epoch": 0.671850699844479, + "grad_norm": 0.9005318880081177, + "learning_rate": 5.4000000000000005e-05, + "loss": 1.863, + "num_input_tokens_seen": 976400, + "step": 54 + }, + { + "epoch": 0.7091757387247278, + "grad_norm": 0.4560866355895996, + "learning_rate": 5.6999999999999996e-05, + "loss": 1.6358, + "num_input_tokens_seen": 1028864, + "step": 57 + }, + { + "epoch": 0.7465007776049767, + "grad_norm": 0.4046150743961334, + "learning_rate": 6e-05, + "loss": 1.6125, + "num_input_tokens_seen": 1082896, + "step": 60 + }, + { + "epoch": 0.7838258164852255, + "grad_norm": 0.40974393486976624, + "learning_rate": 6.3e-05, + "loss": 1.7412, + "num_input_tokens_seen": 1141856, + "step": 63 + }, + { + "epoch": 0.8211508553654744, + "grad_norm": 0.38810229301452637, + "learning_rate": 6.6e-05, + "loss": 1.6413, + "num_input_tokens_seen": 1196976, + "step": 66 + }, + { + "epoch": 0.8584758942457231, + "grad_norm": 0.3670073449611664, + "learning_rate": 6.9e-05, + "loss": 1.6965, + "num_input_tokens_seen": 1249696, + "step": 69 + }, + { + "epoch": 0.895800933125972, + "grad_norm": 0.3049280345439911, + "learning_rate": 7.2e-05, + "loss": 1.623, + "num_input_tokens_seen": 1304880, + "step": 72 + }, + { + "epoch": 0.9331259720062208, + "grad_norm": 0.2850935161113739, + "learning_rate": 7.500000000000001e-05, + "loss": 1.5551, + "num_input_tokens_seen": 1359776, + "step": 75 + }, + { + "epoch": 0.9704510108864697, + "grad_norm": 0.262834370136261, + "learning_rate": 7.800000000000001e-05, + "loss": 1.5815, + "num_input_tokens_seen": 1415584, + "step": 78 + }, + { + "epoch": 1.0077760497667185, + "grad_norm": 0.24694491922855377, + "learning_rate": 8.1e-05, + "loss": 1.5404, + "num_input_tokens_seen": 1468720, + "step": 81 + }, + { + "epoch": 1.0451010886469674, + "grad_norm": 0.24158701300621033, + "learning_rate": 8.4e-05, + "loss": 1.5862, + "num_input_tokens_seen": 1525872, + "step": 84 + }, + { + "epoch": 1.0824261275272162, + "grad_norm": 0.2713283598423004, + "learning_rate": 8.7e-05, + "loss": 1.6087, + "num_input_tokens_seen": 1577456, + "step": 87 + }, + { + "epoch": 1.119751166407465, + "grad_norm": 0.23512157797813416, + "learning_rate": 9e-05, + "loss": 1.5819, + "num_input_tokens_seen": 1631152, + "step": 90 + }, + { + "epoch": 1.157076205287714, + "grad_norm": 0.24216856062412262, + "learning_rate": 9.300000000000001e-05, + "loss": 1.6023, + "num_input_tokens_seen": 1692128, + "step": 93 + }, + { + "epoch": 1.1944012441679628, + "grad_norm": 0.22018860280513763, + "learning_rate": 9.6e-05, + "loss": 1.5937, + "num_input_tokens_seen": 1744368, + "step": 96 + }, + { + "epoch": 1.2317262830482114, + "grad_norm": 0.26621294021606445, + "learning_rate": 9.900000000000001e-05, + "loss": 1.588, + "num_input_tokens_seen": 1795904, + "step": 99 + }, + { + "epoch": 1.2690513219284603, + "grad_norm": 0.18780255317687988, + "learning_rate": 9.999878153526974e-05, + "loss": 1.5538, + "num_input_tokens_seen": 1851744, + "step": 102 + }, + { + "epoch": 1.3063763608087091, + "grad_norm": 0.25466451048851013, + "learning_rate": 9.999238475781957e-05, + "loss": 1.5003, + "num_input_tokens_seen": 1904912, + "step": 105 + }, + { + "epoch": 1.343701399688958, + "grad_norm": 0.2715682089328766, + "learning_rate": 9.998050575201771e-05, + "loss": 1.5044, + "num_input_tokens_seen": 1956592, + "step": 108 + }, + { + "epoch": 1.3810264385692068, + "grad_norm": 0.3099578320980072, + "learning_rate": 9.996314582053106e-05, + "loss": 1.5709, + "num_input_tokens_seen": 2008672, + "step": 111 + }, + { + "epoch": 1.4183514774494557, + "grad_norm": 0.23790551722049713, + "learning_rate": 9.99403068670717e-05, + "loss": 1.5742, + "num_input_tokens_seen": 2063056, + "step": 114 + }, + { + "epoch": 1.4556765163297045, + "grad_norm": 0.23921732604503632, + "learning_rate": 9.991199139618827e-05, + "loss": 1.4958, + "num_input_tokens_seen": 2120000, + "step": 117 + }, + { + "epoch": 1.4930015552099534, + "grad_norm": 0.23010362684726715, + "learning_rate": 9.987820251299122e-05, + "loss": 1.5715, + "num_input_tokens_seen": 2173760, + "step": 120 + }, + { + "epoch": 1.5303265940902022, + "grad_norm": 0.21609963476657867, + "learning_rate": 9.983894392281237e-05, + "loss": 1.5582, + "num_input_tokens_seen": 2228944, + "step": 123 + }, + { + "epoch": 1.5676516329704508, + "grad_norm": 0.28609582781791687, + "learning_rate": 9.979421993079852e-05, + "loss": 1.4904, + "num_input_tokens_seen": 2280544, + "step": 126 + }, + { + "epoch": 1.6049766718506997, + "grad_norm": 0.2686341404914856, + "learning_rate": 9.974403544143941e-05, + "loss": 1.4941, + "num_input_tokens_seen": 2337536, + "step": 129 + }, + { + "epoch": 1.6423017107309485, + "grad_norm": 0.3083486557006836, + "learning_rate": 9.968839595802982e-05, + "loss": 1.5594, + "num_input_tokens_seen": 2391008, + "step": 132 + }, + { + "epoch": 1.6796267496111974, + "grad_norm": 0.2757379114627838, + "learning_rate": 9.962730758206611e-05, + "loss": 1.45, + "num_input_tokens_seen": 2445312, + "step": 135 + }, + { + "epoch": 1.7169517884914463, + "grad_norm": 0.3191740810871124, + "learning_rate": 9.956077701257709e-05, + "loss": 1.6545, + "num_input_tokens_seen": 2495616, + "step": 138 + }, + { + "epoch": 1.754276827371695, + "grad_norm": 0.2461022436618805, + "learning_rate": 9.948881154538945e-05, + "loss": 1.5083, + "num_input_tokens_seen": 2553136, + "step": 141 + }, + { + "epoch": 1.791601866251944, + "grad_norm": 0.2785870134830475, + "learning_rate": 9.941141907232765e-05, + "loss": 1.511, + "num_input_tokens_seen": 2607888, + "step": 144 + }, + { + "epoch": 1.8289269051321928, + "grad_norm": 0.2595404088497162, + "learning_rate": 9.932860808034848e-05, + "loss": 1.4864, + "num_input_tokens_seen": 2662928, + "step": 147 + }, + { + "epoch": 1.8662519440124417, + "grad_norm": 0.3087688982486725, + "learning_rate": 9.924038765061042e-05, + "loss": 1.4787, + "num_input_tokens_seen": 2719936, + "step": 150 + }, + { + "epoch": 1.9035769828926905, + "grad_norm": 0.2820245325565338, + "learning_rate": 9.914676745747772e-05, + "loss": 1.4957, + "num_input_tokens_seen": 2774928, + "step": 153 + }, + { + "epoch": 1.9409020217729394, + "grad_norm": 0.29079610109329224, + "learning_rate": 9.904775776745958e-05, + "loss": 1.5388, + "num_input_tokens_seen": 2831584, + "step": 156 + }, + { + "epoch": 1.9782270606531882, + "grad_norm": 0.32320481538772583, + "learning_rate": 9.894336943808426e-05, + "loss": 1.4156, + "num_input_tokens_seen": 2886528, + "step": 159 + }, + { + "epoch": 2.015552099533437, + "grad_norm": 0.31004923582077026, + "learning_rate": 9.88336139167084e-05, + "loss": 1.4933, + "num_input_tokens_seen": 2944048, + "step": 162 + }, + { + "epoch": 2.052877138413686, + "grad_norm": 0.2879312336444855, + "learning_rate": 9.871850323926177e-05, + "loss": 1.4981, + "num_input_tokens_seen": 2996624, + "step": 165 + }, + { + "epoch": 2.0902021772939348, + "grad_norm": 0.32446593046188354, + "learning_rate": 9.859805002892732e-05, + "loss": 1.5245, + "num_input_tokens_seen": 3050672, + "step": 168 + }, + { + "epoch": 2.1275272161741836, + "grad_norm": 0.38426050543785095, + "learning_rate": 9.847226749475695e-05, + "loss": 1.4546, + "num_input_tokens_seen": 3101648, + "step": 171 + }, + { + "epoch": 2.1648522550544325, + "grad_norm": 0.31284961104393005, + "learning_rate": 9.834116943022298e-05, + "loss": 1.4426, + "num_input_tokens_seen": 3158912, + "step": 174 + }, + { + "epoch": 2.2021772939346813, + "grad_norm": 0.3530096709728241, + "learning_rate": 9.820477021170551e-05, + "loss": 1.4767, + "num_input_tokens_seen": 3215280, + "step": 177 + }, + { + "epoch": 2.23950233281493, + "grad_norm": 0.34290698170661926, + "learning_rate": 9.806308479691595e-05, + "loss": 1.4652, + "num_input_tokens_seen": 3270144, + "step": 180 + }, + { + "epoch": 2.276827371695179, + "grad_norm": 0.30220866203308105, + "learning_rate": 9.791612872325667e-05, + "loss": 1.451, + "num_input_tokens_seen": 3326128, + "step": 183 + }, + { + "epoch": 2.314152410575428, + "grad_norm": 0.3330557644367218, + "learning_rate": 9.776391810611718e-05, + "loss": 1.4005, + "num_input_tokens_seen": 3382096, + "step": 186 + }, + { + "epoch": 2.3514774494556763, + "grad_norm": 0.35981714725494385, + "learning_rate": 9.760646963710694e-05, + "loss": 1.5281, + "num_input_tokens_seen": 3432624, + "step": 189 + }, + { + "epoch": 2.3888024883359256, + "grad_norm": 0.41824910044670105, + "learning_rate": 9.744380058222483e-05, + "loss": 1.4811, + "num_input_tokens_seen": 3485216, + "step": 192 + }, + { + "epoch": 2.426127527216174, + "grad_norm": 0.3749221861362457, + "learning_rate": 9.727592877996585e-05, + "loss": 1.4439, + "num_input_tokens_seen": 3536224, + "step": 195 + }, + { + "epoch": 2.463452566096423, + "grad_norm": 0.41196370124816895, + "learning_rate": 9.710287263936484e-05, + "loss": 1.4742, + "num_input_tokens_seen": 3588128, + "step": 198 + }, + { + "epoch": 2.5007776049766717, + "grad_norm": 0.4999885559082031, + "learning_rate": 9.69246511379778e-05, + "loss": 1.4419, + "num_input_tokens_seen": 3644256, + "step": 201 + }, + { + "epoch": 2.5381026438569205, + "grad_norm": 0.4147838354110718, + "learning_rate": 9.674128381980072e-05, + "loss": 1.4283, + "num_input_tokens_seen": 3697680, + "step": 204 + }, + { + "epoch": 2.5754276827371694, + "grad_norm": 0.3952224850654602, + "learning_rate": 9.655279079312642e-05, + "loss": 1.4107, + "num_input_tokens_seen": 3756608, + "step": 207 + }, + { + "epoch": 2.6127527216174182, + "grad_norm": 0.3909386694431305, + "learning_rate": 9.635919272833938e-05, + "loss": 1.4162, + "num_input_tokens_seen": 3811632, + "step": 210 + }, + { + "epoch": 2.650077760497667, + "grad_norm": 0.4344032406806946, + "learning_rate": 9.616051085564906e-05, + "loss": 1.5205, + "num_input_tokens_seen": 3870112, + "step": 213 + }, + { + "epoch": 2.687402799377916, + "grad_norm": 0.49284985661506653, + "learning_rate": 9.595676696276172e-05, + "loss": 1.5375, + "num_input_tokens_seen": 3926352, + "step": 216 + }, + { + "epoch": 2.724727838258165, + "grad_norm": 0.42042234539985657, + "learning_rate": 9.574798339249125e-05, + "loss": 1.4106, + "num_input_tokens_seen": 3980768, + "step": 219 + }, + { + "epoch": 2.7620528771384136, + "grad_norm": 0.5457227826118469, + "learning_rate": 9.553418304030886e-05, + "loss": 1.4377, + "num_input_tokens_seen": 4029792, + "step": 222 + }, + { + "epoch": 2.7993779160186625, + "grad_norm": 0.4127957820892334, + "learning_rate": 9.53153893518325e-05, + "loss": 1.4638, + "num_input_tokens_seen": 4087248, + "step": 225 + }, + { + "epoch": 2.8367029548989113, + "grad_norm": 0.4777499735355377, + "learning_rate": 9.50916263202557e-05, + "loss": 1.469, + "num_input_tokens_seen": 4142912, + "step": 228 + }, + { + "epoch": 2.87402799377916, + "grad_norm": 0.4768824875354767, + "learning_rate": 9.486291848371643e-05, + "loss": 1.4436, + "num_input_tokens_seen": 4198272, + "step": 231 + }, + { + "epoch": 2.911353032659409, + "grad_norm": 0.4727541506290436, + "learning_rate": 9.462929092260628e-05, + "loss": 1.4491, + "num_input_tokens_seen": 4252528, + "step": 234 + }, + { + "epoch": 2.948678071539658, + "grad_norm": 0.46310773491859436, + "learning_rate": 9.439076925682006e-05, + "loss": 1.4245, + "num_input_tokens_seen": 4305472, + "step": 237 + }, + { + "epoch": 2.9860031104199067, + "grad_norm": 0.554982602596283, + "learning_rate": 9.414737964294636e-05, + "loss": 1.5656, + "num_input_tokens_seen": 4360608, + "step": 240 + }, + { + "epoch": 3.0233281493001556, + "grad_norm": 0.4734109342098236, + "learning_rate": 9.389914877139903e-05, + "loss": 1.362, + "num_input_tokens_seen": 4418496, + "step": 243 + }, + { + "epoch": 3.0606531881804044, + "grad_norm": 0.5144374966621399, + "learning_rate": 9.364610386349049e-05, + "loss": 1.4078, + "num_input_tokens_seen": 4471104, + "step": 246 + }, + { + "epoch": 3.0979782270606533, + "grad_norm": 0.4901665449142456, + "learning_rate": 9.338827266844644e-05, + "loss": 1.3074, + "num_input_tokens_seen": 4522432, + "step": 249 + }, + { + "epoch": 3.135303265940902, + "grad_norm": 0.4994266629219055, + "learning_rate": 9.312568346036288e-05, + "loss": 1.379, + "num_input_tokens_seen": 4577216, + "step": 252 + }, + { + "epoch": 3.172628304821151, + "grad_norm": 0.5738364458084106, + "learning_rate": 9.285836503510562e-05, + "loss": 1.3311, + "num_input_tokens_seen": 4628832, + "step": 255 + }, + { + "epoch": 3.2099533437014, + "grad_norm": 0.5080900192260742, + "learning_rate": 9.258634670715238e-05, + "loss": 1.3993, + "num_input_tokens_seen": 4683312, + "step": 258 + }, + { + "epoch": 3.2472783825816487, + "grad_norm": 0.5958444476127625, + "learning_rate": 9.230965830637821e-05, + "loss": 1.3511, + "num_input_tokens_seen": 4739792, + "step": 261 + }, + { + "epoch": 3.2846034214618975, + "grad_norm": 0.7206479907035828, + "learning_rate": 9.202833017478422e-05, + "loss": 1.3009, + "num_input_tokens_seen": 4791312, + "step": 264 + }, + { + "epoch": 3.3219284603421464, + "grad_norm": 0.6154108643531799, + "learning_rate": 9.174239316317033e-05, + "loss": 1.3149, + "num_input_tokens_seen": 4845744, + "step": 267 + }, + { + "epoch": 3.359253499222395, + "grad_norm": 0.7271692156791687, + "learning_rate": 9.145187862775209e-05, + "loss": 1.3126, + "num_input_tokens_seen": 4897472, + "step": 270 + }, + { + "epoch": 3.396578538102644, + "grad_norm": 0.6346867680549622, + "learning_rate": 9.11568184267221e-05, + "loss": 1.341, + "num_input_tokens_seen": 4954272, + "step": 273 + }, + { + "epoch": 3.4339035769828925, + "grad_norm": 0.686673104763031, + "learning_rate": 9.085724491675642e-05, + "loss": 1.2816, + "num_input_tokens_seen": 5010272, + "step": 276 + }, + { + "epoch": 3.4712286158631414, + "grad_norm": 0.6660853624343872, + "learning_rate": 9.055319094946633e-05, + "loss": 1.3563, + "num_input_tokens_seen": 5066032, + "step": 279 + }, + { + "epoch": 3.50855365474339, + "grad_norm": 0.698714554309845, + "learning_rate": 9.02446898677957e-05, + "loss": 1.3326, + "num_input_tokens_seen": 5115488, + "step": 282 + }, + { + "epoch": 3.545878693623639, + "grad_norm": 0.7857383489608765, + "learning_rate": 8.993177550236464e-05, + "loss": 1.4707, + "num_input_tokens_seen": 5174736, + "step": 285 + }, + { + "epoch": 3.583203732503888, + "grad_norm": 0.7507392764091492, + "learning_rate": 8.961448216775954e-05, + "loss": 1.2941, + "num_input_tokens_seen": 5230016, + "step": 288 + }, + { + "epoch": 3.6205287713841368, + "grad_norm": 0.7710234522819519, + "learning_rate": 8.92928446587701e-05, + "loss": 1.3435, + "num_input_tokens_seen": 5289728, + "step": 291 + }, + { + "epoch": 3.6578538102643856, + "grad_norm": 0.7039404511451721, + "learning_rate": 8.896689824657372e-05, + "loss": 1.3954, + "num_input_tokens_seen": 5344176, + "step": 294 + }, + { + "epoch": 3.6951788491446345, + "grad_norm": 0.8286743760108948, + "learning_rate": 8.863667867486756e-05, + "loss": 1.2821, + "num_input_tokens_seen": 5396144, + "step": 297 + }, + { + "epoch": 3.7325038880248833, + "grad_norm": 0.8500336408615112, + "learning_rate": 8.83022221559489e-05, + "loss": 1.3893, + "num_input_tokens_seen": 5448672, + "step": 300 + }, + { + "epoch": 3.769828926905132, + "grad_norm": 0.7117451429367065, + "learning_rate": 8.796356536674403e-05, + "loss": 1.3796, + "num_input_tokens_seen": 5503696, + "step": 303 + }, + { + "epoch": 3.807153965785381, + "grad_norm": 0.7383677959442139, + "learning_rate": 8.762074544478623e-05, + "loss": 1.4454, + "num_input_tokens_seen": 5557888, + "step": 306 + }, + { + "epoch": 3.84447900466563, + "grad_norm": 0.8263104557991028, + "learning_rate": 8.727379998414311e-05, + "loss": 1.3422, + "num_input_tokens_seen": 5609232, + "step": 309 + }, + { + "epoch": 3.8818040435458787, + "grad_norm": 0.6335726976394653, + "learning_rate": 8.692276703129421e-05, + "loss": 1.3338, + "num_input_tokens_seen": 5667072, + "step": 312 + }, + { + "epoch": 3.9191290824261276, + "grad_norm": 0.7418395280838013, + "learning_rate": 8.656768508095853e-05, + "loss": 1.3997, + "num_input_tokens_seen": 5722672, + "step": 315 + }, + { + "epoch": 3.9564541213063764, + "grad_norm": 0.8165796399116516, + "learning_rate": 8.620859307187339e-05, + "loss": 1.434, + "num_input_tokens_seen": 5780096, + "step": 318 + }, + { + "epoch": 3.9937791601866253, + "grad_norm": 0.905696451663971, + "learning_rate": 8.584553038252414e-05, + "loss": 1.3018, + "num_input_tokens_seen": 5835632, + "step": 321 + }, + { + "epoch": 4.031104199066874, + "grad_norm": 0.7397456765174866, + "learning_rate": 8.547853682682604e-05, + "loss": 1.2729, + "num_input_tokens_seen": 5891616, + "step": 324 + }, + { + "epoch": 4.0684292379471225, + "grad_norm": 0.7480332255363464, + "learning_rate": 8.510765264975813e-05, + "loss": 1.1823, + "num_input_tokens_seen": 5942400, + "step": 327 + }, + { + "epoch": 4.105754276827372, + "grad_norm": 0.9288114905357361, + "learning_rate": 8.473291852294987e-05, + "loss": 1.4483, + "num_input_tokens_seen": 5997728, + "step": 330 + }, + { + "epoch": 4.14307931570762, + "grad_norm": 0.9873988032341003, + "learning_rate": 8.435437554022115e-05, + "loss": 1.1807, + "num_input_tokens_seen": 6049312, + "step": 333 + }, + { + "epoch": 4.1804043545878695, + "grad_norm": 0.7703122496604919, + "learning_rate": 8.397206521307584e-05, + "loss": 1.2298, + "num_input_tokens_seen": 6108288, + "step": 336 + }, + { + "epoch": 4.217729393468118, + "grad_norm": 0.890841007232666, + "learning_rate": 8.358602946614951e-05, + "loss": 1.1629, + "num_input_tokens_seen": 6160144, + "step": 339 + }, + { + "epoch": 4.255054432348367, + "grad_norm": 0.8476828932762146, + "learning_rate": 8.319631063261209e-05, + "loss": 1.2641, + "num_input_tokens_seen": 6216672, + "step": 342 + }, + { + "epoch": 4.292379471228616, + "grad_norm": 0.8666311502456665, + "learning_rate": 8.280295144952536e-05, + "loss": 1.2478, + "num_input_tokens_seen": 6271168, + "step": 345 + }, + { + "epoch": 4.329704510108865, + "grad_norm": 1.005199670791626, + "learning_rate": 8.240599505315655e-05, + "loss": 1.2675, + "num_input_tokens_seen": 6326992, + "step": 348 + }, + { + "epoch": 4.367029548989113, + "grad_norm": 0.9311610460281372, + "learning_rate": 8.200548497424778e-05, + "loss": 1.2551, + "num_input_tokens_seen": 6380944, + "step": 351 + }, + { + "epoch": 4.404354587869363, + "grad_norm": 0.8687139749526978, + "learning_rate": 8.160146513324254e-05, + "loss": 1.2011, + "num_input_tokens_seen": 6436144, + "step": 354 + }, + { + "epoch": 4.441679626749611, + "grad_norm": 0.8516527414321899, + "learning_rate": 8.119397983546932e-05, + "loss": 1.1243, + "num_input_tokens_seen": 6487824, + "step": 357 + }, + { + "epoch": 4.47900466562986, + "grad_norm": 0.9878633618354797, + "learning_rate": 8.07830737662829e-05, + "loss": 1.2775, + "num_input_tokens_seen": 6540448, + "step": 360 + }, + { + "epoch": 4.516329704510109, + "grad_norm": 1.2680439949035645, + "learning_rate": 8.036879198616434e-05, + "loss": 1.208, + "num_input_tokens_seen": 6593248, + "step": 363 + }, + { + "epoch": 4.553654743390358, + "grad_norm": 0.96169513463974, + "learning_rate": 7.99511799257793e-05, + "loss": 1.1828, + "num_input_tokens_seen": 6648976, + "step": 366 + }, + { + "epoch": 4.590979782270606, + "grad_norm": 1.041894555091858, + "learning_rate": 7.953028338099627e-05, + "loss": 1.2522, + "num_input_tokens_seen": 6703440, + "step": 369 + }, + { + "epoch": 4.628304821150856, + "grad_norm": 1.0338892936706543, + "learning_rate": 7.910614850786448e-05, + "loss": 1.2285, + "num_input_tokens_seen": 6755360, + "step": 372 + }, + { + "epoch": 4.665629860031104, + "grad_norm": 0.849757969379425, + "learning_rate": 7.86788218175523e-05, + "loss": 1.2556, + "num_input_tokens_seen": 6811440, + "step": 375 + }, + { + "epoch": 4.7029548989113525, + "grad_norm": 0.8915488719940186, + "learning_rate": 7.82483501712469e-05, + "loss": 1.3034, + "num_input_tokens_seen": 6870992, + "step": 378 + }, + { + "epoch": 4.740279937791602, + "grad_norm": 1.2227041721343994, + "learning_rate": 7.781478077501525e-05, + "loss": 1.2352, + "num_input_tokens_seen": 6927120, + "step": 381 + }, + { + "epoch": 4.777604976671851, + "grad_norm": 0.7694201469421387, + "learning_rate": 7.737816117462752e-05, + "loss": 1.2768, + "num_input_tokens_seen": 6983216, + "step": 384 + }, + { + "epoch": 4.8149300155520995, + "grad_norm": 0.880551815032959, + "learning_rate": 7.693853925034315e-05, + "loss": 1.3318, + "num_input_tokens_seen": 7041648, + "step": 387 + }, + { + "epoch": 4.852255054432348, + "grad_norm": 0.9098656177520752, + "learning_rate": 7.649596321166024e-05, + "loss": 1.2715, + "num_input_tokens_seen": 7095104, + "step": 390 + }, + { + "epoch": 4.889580093312597, + "grad_norm": 1.0869756937026978, + "learning_rate": 7.605048159202883e-05, + "loss": 1.1691, + "num_input_tokens_seen": 7146512, + "step": 393 + }, + { + "epoch": 4.926905132192846, + "grad_norm": 1.0197746753692627, + "learning_rate": 7.560214324352858e-05, + "loss": 1.2651, + "num_input_tokens_seen": 7198704, + "step": 396 + }, + { + "epoch": 4.964230171073095, + "grad_norm": 0.8996115326881409, + "learning_rate": 7.515099733151177e-05, + "loss": 1.3019, + "num_input_tokens_seen": 7254608, + "step": 399 + }, + { + "epoch": 5.001555209953343, + "grad_norm": 0.9211968779563904, + "learning_rate": 7.469709332921155e-05, + "loss": 1.2474, + "num_input_tokens_seen": 7312224, + "step": 402 + }, + { + "epoch": 5.038880248833593, + "grad_norm": 0.8490816354751587, + "learning_rate": 7.424048101231686e-05, + "loss": 1.1313, + "num_input_tokens_seen": 7365456, + "step": 405 + }, + { + "epoch": 5.076205287713841, + "grad_norm": 1.145821452140808, + "learning_rate": 7.378121045351378e-05, + "loss": 1.265, + "num_input_tokens_seen": 7421200, + "step": 408 + }, + { + "epoch": 5.11353032659409, + "grad_norm": 1.3000530004501343, + "learning_rate": 7.331933201699457e-05, + "loss": 1.1876, + "num_input_tokens_seen": 7474704, + "step": 411 + }, + { + "epoch": 5.150855365474339, + "grad_norm": 1.0333030223846436, + "learning_rate": 7.285489635293472e-05, + "loss": 1.063, + "num_input_tokens_seen": 7527856, + "step": 414 + }, + { + "epoch": 5.188180404354588, + "grad_norm": 1.1004230976104736, + "learning_rate": 7.238795439193848e-05, + "loss": 1.1132, + "num_input_tokens_seen": 7585040, + "step": 417 + }, + { + "epoch": 5.2255054432348365, + "grad_norm": 1.0957094430923462, + "learning_rate": 7.191855733945387e-05, + "loss": 1.1264, + "num_input_tokens_seen": 7643952, + "step": 420 + }, + { + "epoch": 5.262830482115086, + "grad_norm": 1.0807350873947144, + "learning_rate": 7.14467566701573e-05, + "loss": 1.1316, + "num_input_tokens_seen": 7699952, + "step": 423 + }, + { + "epoch": 5.300155520995334, + "grad_norm": 1.1621692180633545, + "learning_rate": 7.097260412230886e-05, + "loss": 1.1275, + "num_input_tokens_seen": 7754384, + "step": 426 + }, + { + "epoch": 5.3374805598755835, + "grad_norm": 1.1265610456466675, + "learning_rate": 7.049615169207864e-05, + "loss": 1.1736, + "num_input_tokens_seen": 7811760, + "step": 429 + }, + { + "epoch": 5.374805598755832, + "grad_norm": 1.049169659614563, + "learning_rate": 7.001745162784477e-05, + "loss": 1.1063, + "num_input_tokens_seen": 7867376, + "step": 432 + }, + { + "epoch": 5.412130637636081, + "grad_norm": 1.2782608270645142, + "learning_rate": 6.953655642446368e-05, + "loss": 1.0924, + "num_input_tokens_seen": 7920736, + "step": 435 + }, + { + "epoch": 5.44945567651633, + "grad_norm": 1.0864709615707397, + "learning_rate": 6.905351881751372e-05, + "loss": 1.1583, + "num_input_tokens_seen": 7975088, + "step": 438 + }, + { + "epoch": 5.486780715396579, + "grad_norm": 1.2666634321212769, + "learning_rate": 6.856839177751176e-05, + "loss": 1.0812, + "num_input_tokens_seen": 8027328, + "step": 441 + }, + { + "epoch": 5.524105754276827, + "grad_norm": 0.9478985667228699, + "learning_rate": 6.808122850410461e-05, + "loss": 1.0749, + "num_input_tokens_seen": 8083392, + "step": 444 + }, + { + "epoch": 5.561430793157077, + "grad_norm": 1.4468153715133667, + "learning_rate": 6.759208242023509e-05, + "loss": 1.1026, + "num_input_tokens_seen": 8134800, + "step": 447 + }, + { + "epoch": 5.598755832037325, + "grad_norm": 1.2189656496047974, + "learning_rate": 6.710100716628344e-05, + "loss": 1.092, + "num_input_tokens_seen": 8188944, + "step": 450 + }, + { + "epoch": 5.636080870917574, + "grad_norm": 1.040446400642395, + "learning_rate": 6.660805659418516e-05, + "loss": 1.2196, + "num_input_tokens_seen": 8248096, + "step": 453 + }, + { + "epoch": 5.673405909797823, + "grad_norm": 1.137902021408081, + "learning_rate": 6.611328476152557e-05, + "loss": 1.1435, + "num_input_tokens_seen": 8302864, + "step": 456 + }, + { + "epoch": 5.710730948678071, + "grad_norm": 1.19194757938385, + "learning_rate": 6.561674592561163e-05, + "loss": 1.0955, + "num_input_tokens_seen": 8356800, + "step": 459 + }, + { + "epoch": 5.74805598755832, + "grad_norm": 1.2011014223098755, + "learning_rate": 6.511849453752223e-05, + "loss": 1.2012, + "num_input_tokens_seen": 8410576, + "step": 462 + }, + { + "epoch": 5.78538102643857, + "grad_norm": 1.2967370748519897, + "learning_rate": 6.461858523613684e-05, + "loss": 1.1735, + "num_input_tokens_seen": 8468016, + "step": 465 + }, + { + "epoch": 5.822706065318818, + "grad_norm": 1.2468522787094116, + "learning_rate": 6.411707284214384e-05, + "loss": 1.1583, + "num_input_tokens_seen": 8518768, + "step": 468 + }, + { + "epoch": 5.8600311041990665, + "grad_norm": 1.3397902250289917, + "learning_rate": 6.361401235202872e-05, + "loss": 1.1513, + "num_input_tokens_seen": 8573872, + "step": 471 + }, + { + "epoch": 5.897356143079316, + "grad_norm": 1.1201503276824951, + "learning_rate": 6.310945893204324e-05, + "loss": 1.06, + "num_input_tokens_seen": 8628928, + "step": 474 + }, + { + "epoch": 5.934681181959564, + "grad_norm": 1.3698691129684448, + "learning_rate": 6.26034679121557e-05, + "loss": 1.1553, + "num_input_tokens_seen": 8681104, + "step": 477 + }, + { + "epoch": 5.9720062208398135, + "grad_norm": 1.2262195348739624, + "learning_rate": 6.209609477998338e-05, + "loss": 1.1209, + "num_input_tokens_seen": 8735984, + "step": 480 + }, + { + "epoch": 6.009331259720062, + "grad_norm": 0.9644012451171875, + "learning_rate": 6.158739517470786e-05, + "loss": 1.1008, + "num_input_tokens_seen": 8790416, + "step": 483 + }, + { + "epoch": 6.046656298600311, + "grad_norm": 1.0709476470947266, + "learning_rate": 6.107742488097338e-05, + "loss": 1.0679, + "num_input_tokens_seen": 8848464, + "step": 486 + }, + { + "epoch": 6.08398133748056, + "grad_norm": 1.0696582794189453, + "learning_rate": 6.056623982276944e-05, + "loss": 1.0223, + "num_input_tokens_seen": 8900736, + "step": 489 + }, + { + "epoch": 6.121306376360809, + "grad_norm": 1.2330180406570435, + "learning_rate": 6.005389605729824e-05, + "loss": 1.0493, + "num_input_tokens_seen": 8957616, + "step": 492 + }, + { + "epoch": 6.158631415241057, + "grad_norm": 1.1921484470367432, + "learning_rate": 5.9540449768827246e-05, + "loss": 1.0017, + "num_input_tokens_seen": 9013680, + "step": 495 + }, + { + "epoch": 6.195956454121307, + "grad_norm": 1.383009433746338, + "learning_rate": 5.902595726252801e-05, + "loss": 1.0215, + "num_input_tokens_seen": 9069536, + "step": 498 + }, + { + "epoch": 6.2208398133748055, + "eval_loss": 1.6311813592910767, + "eval_runtime": 10.0618, + "eval_samples_per_second": 99.386, + "eval_steps_per_second": 7.156, + "num_input_tokens_seen": 9110112, + "step": 500 + }, + { + "epoch": 6.233281493001555, + "grad_norm": 1.2158509492874146, + "learning_rate": 5.851047495830163e-05, + "loss": 1.0748, + "num_input_tokens_seen": 9127472, + "step": 501 + }, + { + "epoch": 6.270606531881804, + "grad_norm": 1.2159379720687866, + "learning_rate": 5.799405938459175e-05, + "loss": 0.9791, + "num_input_tokens_seen": 9176608, + "step": 504 + }, + { + "epoch": 6.307931570762053, + "grad_norm": 1.4739444255828857, + "learning_rate": 5.747676717218549e-05, + "loss": 1.0378, + "num_input_tokens_seen": 9228576, + "step": 507 + }, + { + "epoch": 6.345256609642302, + "grad_norm": 1.128970980644226, + "learning_rate": 5.695865504800327e-05, + "loss": 1.0241, + "num_input_tokens_seen": 9286272, + "step": 510 + }, + { + "epoch": 6.38258164852255, + "grad_norm": 1.5108153820037842, + "learning_rate": 5.643977982887815e-05, + "loss": 1.0465, + "num_input_tokens_seen": 9342096, + "step": 513 + }, + { + "epoch": 6.4199066874028, + "grad_norm": 1.2417353391647339, + "learning_rate": 5.5920198415325064e-05, + "loss": 1.0456, + "num_input_tokens_seen": 9396240, + "step": 516 + }, + { + "epoch": 6.457231726283048, + "grad_norm": 1.093074083328247, + "learning_rate": 5.539996778530115e-05, + "loss": 1.0381, + "num_input_tokens_seen": 9451488, + "step": 519 + }, + { + "epoch": 6.494556765163297, + "grad_norm": 1.3556379079818726, + "learning_rate": 5.487914498795747e-05, + "loss": 1.1424, + "num_input_tokens_seen": 9506784, + "step": 522 + }, + { + "epoch": 6.531881804043546, + "grad_norm": 1.4488335847854614, + "learning_rate": 5.435778713738292e-05, + "loss": 1.0076, + "num_input_tokens_seen": 9562448, + "step": 525 + }, + { + "epoch": 6.569206842923795, + "grad_norm": 1.2876710891723633, + "learning_rate": 5.383595140634093e-05, + "loss": 1.0462, + "num_input_tokens_seen": 9615664, + "step": 528 + }, + { + "epoch": 6.6065318818040435, + "grad_norm": 1.330777645111084, + "learning_rate": 5.3313695020000024e-05, + "loss": 1.0512, + "num_input_tokens_seen": 9671744, + "step": 531 + }, + { + "epoch": 6.643856920684293, + "grad_norm": 1.5395786762237549, + "learning_rate": 5.279107524965819e-05, + "loss": 0.9775, + "num_input_tokens_seen": 9724048, + "step": 534 + }, + { + "epoch": 6.681181959564541, + "grad_norm": 1.4420833587646484, + "learning_rate": 5.226814940646269e-05, + "loss": 1.0258, + "num_input_tokens_seen": 9776176, + "step": 537 + }, + { + "epoch": 6.71850699844479, + "grad_norm": 1.342041254043579, + "learning_rate": 5.174497483512506e-05, + "loss": 1.0739, + "num_input_tokens_seen": 9832192, + "step": 540 + }, + { + "epoch": 6.755832037325039, + "grad_norm": 1.5209355354309082, + "learning_rate": 5.1221608907632665e-05, + "loss": 1.0348, + "num_input_tokens_seen": 9886048, + "step": 543 + }, + { + "epoch": 6.793157076205288, + "grad_norm": 1.2620528936386108, + "learning_rate": 5.0698109016957274e-05, + "loss": 1.0928, + "num_input_tokens_seen": 9939184, + "step": 546 + }, + { + "epoch": 6.830482115085537, + "grad_norm": 1.2428035736083984, + "learning_rate": 5.017453257076119e-05, + "loss": 1.004, + "num_input_tokens_seen": 9992272, + "step": 549 + }, + { + "epoch": 6.867807153965785, + "grad_norm": 1.2153327465057373, + "learning_rate": 4.965093698510193e-05, + "loss": 0.9519, + "num_input_tokens_seen": 10045168, + "step": 552 + }, + { + "epoch": 6.905132192846034, + "grad_norm": 1.3050085306167603, + "learning_rate": 4.912737967813583e-05, + "loss": 1.1466, + "num_input_tokens_seen": 10102720, + "step": 555 + }, + { + "epoch": 6.942457231726283, + "grad_norm": 1.3073248863220215, + "learning_rate": 4.860391806382157e-05, + "loss": 1.0908, + "num_input_tokens_seen": 10156832, + "step": 558 + }, + { + "epoch": 6.979782270606532, + "grad_norm": 1.280698299407959, + "learning_rate": 4.8080609545624004e-05, + "loss": 0.9568, + "num_input_tokens_seen": 10210208, + "step": 561 + }, + { + "epoch": 7.01710730948678, + "grad_norm": 1.1008414030075073, + "learning_rate": 4.755751151021934e-05, + "loss": 1.0496, + "num_input_tokens_seen": 10267456, + "step": 564 + }, + { + "epoch": 7.05443234836703, + "grad_norm": 1.4668625593185425, + "learning_rate": 4.703468132120193e-05, + "loss": 0.8353, + "num_input_tokens_seen": 10317312, + "step": 567 + }, + { + "epoch": 7.091757387247278, + "grad_norm": 1.4211206436157227, + "learning_rate": 4.6512176312793736e-05, + "loss": 0.9929, + "num_input_tokens_seen": 10373232, + "step": 570 + }, + { + "epoch": 7.129082426127527, + "grad_norm": 1.2859231233596802, + "learning_rate": 4.599005378355706e-05, + "loss": 1.0196, + "num_input_tokens_seen": 10429936, + "step": 573 + }, + { + "epoch": 7.166407465007776, + "grad_norm": 1.291049599647522, + "learning_rate": 4.5468370990111006e-05, + "loss": 0.9094, + "num_input_tokens_seen": 10485824, + "step": 576 + }, + { + "epoch": 7.203732503888025, + "grad_norm": 1.2573702335357666, + "learning_rate": 4.494718514085268e-05, + "loss": 0.9206, + "num_input_tokens_seen": 10540736, + "step": 579 + }, + { + "epoch": 7.2410575427682735, + "grad_norm": 1.6116251945495605, + "learning_rate": 4.442655338968373e-05, + "loss": 1.0173, + "num_input_tokens_seen": 10596192, + "step": 582 + }, + { + "epoch": 7.278382581648523, + "grad_norm": 1.2712162733078003, + "learning_rate": 4.390653282974264e-05, + "loss": 1.0234, + "num_input_tokens_seen": 10651376, + "step": 585 + }, + { + "epoch": 7.315707620528771, + "grad_norm": 1.3022940158843994, + "learning_rate": 4.3387180487143876e-05, + "loss": 0.9771, + "num_input_tokens_seen": 10704512, + "step": 588 + }, + { + "epoch": 7.3530326594090205, + "grad_norm": 1.3681883811950684, + "learning_rate": 4.2868553314724425e-05, + "loss": 0.8928, + "num_input_tokens_seen": 10758656, + "step": 591 + }, + { + "epoch": 7.390357698289269, + "grad_norm": 1.2594826221466064, + "learning_rate": 4.23507081857981e-05, + "loss": 1.0248, + "num_input_tokens_seen": 10814112, + "step": 594 + }, + { + "epoch": 7.427682737169518, + "grad_norm": 1.401910424232483, + "learning_rate": 4.1833701887918904e-05, + "loss": 0.8644, + "num_input_tokens_seen": 10869264, + "step": 597 + }, + { + "epoch": 7.465007776049767, + "grad_norm": 1.2500008344650269, + "learning_rate": 4.131759111665349e-05, + "loss": 0.9704, + "num_input_tokens_seen": 10922736, + "step": 600 + }, + { + "epoch": 7.502332814930016, + "grad_norm": 1.280358076095581, + "learning_rate": 4.080243246936399e-05, + "loss": 1.0462, + "num_input_tokens_seen": 10979712, + "step": 603 + }, + { + "epoch": 7.539657853810264, + "grad_norm": 1.3356202840805054, + "learning_rate": 4.028828243900141e-05, + "loss": 0.9357, + "num_input_tokens_seen": 11036672, + "step": 606 + }, + { + "epoch": 7.576982892690513, + "grad_norm": 1.3757556676864624, + "learning_rate": 3.9775197407910485e-05, + "loss": 0.9906, + "num_input_tokens_seen": 11092496, + "step": 609 + }, + { + "epoch": 7.614307931570762, + "grad_norm": 1.389939308166504, + "learning_rate": 3.926323364164684e-05, + "loss": 0.9157, + "num_input_tokens_seen": 11143456, + "step": 612 + }, + { + "epoch": 7.651632970451011, + "grad_norm": 1.3326400518417358, + "learning_rate": 3.875244728280676e-05, + "loss": 0.9151, + "num_input_tokens_seen": 11200064, + "step": 615 + }, + { + "epoch": 7.68895800933126, + "grad_norm": 1.2925583124160767, + "learning_rate": 3.82428943448705e-05, + "loss": 1.0072, + "num_input_tokens_seen": 11256752, + "step": 618 + }, + { + "epoch": 7.726283048211508, + "grad_norm": 1.608323574066162, + "learning_rate": 3.773463070605987e-05, + "loss": 0.9138, + "num_input_tokens_seen": 11311200, + "step": 621 + }, + { + "epoch": 7.763608087091757, + "grad_norm": 1.2572425603866577, + "learning_rate": 3.7227712103210486e-05, + "loss": 0.9126, + "num_input_tokens_seen": 11362640, + "step": 624 + }, + { + "epoch": 7.800933125972006, + "grad_norm": 1.366409420967102, + "learning_rate": 3.6722194125659556e-05, + "loss": 0.921, + "num_input_tokens_seen": 11420080, + "step": 627 + }, + { + "epoch": 7.838258164852255, + "grad_norm": 1.367814302444458, + "learning_rate": 3.6218132209150045e-05, + "loss": 0.9167, + "num_input_tokens_seen": 11471056, + "step": 630 + }, + { + "epoch": 7.8755832037325035, + "grad_norm": 1.4953104257583618, + "learning_rate": 3.5715581629751326e-05, + "loss": 1.0907, + "num_input_tokens_seen": 11526928, + "step": 633 + }, + { + "epoch": 7.912908242612753, + "grad_norm": 1.4320324659347534, + "learning_rate": 3.5214597497797684e-05, + "loss": 0.9222, + "num_input_tokens_seen": 11580928, + "step": 636 + }, + { + "epoch": 7.950233281493001, + "grad_norm": 1.6235154867172241, + "learning_rate": 3.471523475184472e-05, + "loss": 0.9267, + "num_input_tokens_seen": 11634416, + "step": 639 + }, + { + "epoch": 7.9875583203732505, + "grad_norm": 1.4394381046295166, + "learning_rate": 3.4217548152644885e-05, + "loss": 0.9974, + "num_input_tokens_seen": 11688928, + "step": 642 + }, + { + "epoch": 8.024883359253499, + "grad_norm": 1.2756644487380981, + "learning_rate": 3.372159227714218e-05, + "loss": 0.924, + "num_input_tokens_seen": 11741968, + "step": 645 + }, + { + "epoch": 8.062208398133748, + "grad_norm": 1.3790043592453003, + "learning_rate": 3.322742151248725e-05, + "loss": 0.7697, + "num_input_tokens_seen": 11794432, + "step": 648 + }, + { + "epoch": 8.099533437013998, + "grad_norm": 1.3841499090194702, + "learning_rate": 3.273509005007327e-05, + "loss": 0.9866, + "num_input_tokens_seen": 11849744, + "step": 651 + }, + { + "epoch": 8.136858475894245, + "grad_norm": 1.5157493352890015, + "learning_rate": 3.224465187959316e-05, + "loss": 0.9879, + "num_input_tokens_seen": 11904800, + "step": 654 + }, + { + "epoch": 8.174183514774494, + "grad_norm": 1.4144805669784546, + "learning_rate": 3.1756160783119016e-05, + "loss": 0.8112, + "num_input_tokens_seen": 11960208, + "step": 657 + }, + { + "epoch": 8.211508553654744, + "grad_norm": 1.2442865371704102, + "learning_rate": 3.12696703292044e-05, + "loss": 0.8722, + "num_input_tokens_seen": 12012304, + "step": 660 + }, + { + "epoch": 8.248833592534993, + "grad_norm": 1.4178701639175415, + "learning_rate": 3.078523386700982e-05, + "loss": 0.9897, + "num_input_tokens_seen": 12067760, + "step": 663 + }, + { + "epoch": 8.28615863141524, + "grad_norm": 1.380922794342041, + "learning_rate": 3.0302904520452447e-05, + "loss": 0.9071, + "num_input_tokens_seen": 12127248, + "step": 666 + }, + { + "epoch": 8.32348367029549, + "grad_norm": 1.3610669374465942, + "learning_rate": 2.9822735182380496e-05, + "loss": 0.8776, + "num_input_tokens_seen": 12183520, + "step": 669 + }, + { + "epoch": 8.360808709175739, + "grad_norm": 1.3397613763809204, + "learning_rate": 2.934477850877292e-05, + "loss": 0.9587, + "num_input_tokens_seen": 12240960, + "step": 672 + }, + { + "epoch": 8.398133748055988, + "grad_norm": 1.259645938873291, + "learning_rate": 2.886908691296504e-05, + "loss": 0.8807, + "num_input_tokens_seen": 12295600, + "step": 675 + }, + { + "epoch": 8.435458786936236, + "grad_norm": 1.4922188520431519, + "learning_rate": 2.8395712559900877e-05, + "loss": 0.8717, + "num_input_tokens_seen": 12349072, + "step": 678 + }, + { + "epoch": 8.472783825816485, + "grad_norm": 1.2398678064346313, + "learning_rate": 2.7924707360412746e-05, + "loss": 0.8578, + "num_input_tokens_seen": 12403360, + "step": 681 + }, + { + "epoch": 8.510108864696734, + "grad_norm": 1.421632170677185, + "learning_rate": 2.7456122965528475e-05, + "loss": 0.9193, + "num_input_tokens_seen": 12462624, + "step": 684 + }, + { + "epoch": 8.547433903576984, + "grad_norm": 1.4103708267211914, + "learning_rate": 2.699001076080742e-05, + "loss": 0.909, + "num_input_tokens_seen": 12519456, + "step": 687 + }, + { + "epoch": 8.584758942457231, + "grad_norm": 1.4057148694992065, + "learning_rate": 2.6526421860705473e-05, + "loss": 0.9335, + "num_input_tokens_seen": 12573888, + "step": 690 + }, + { + "epoch": 8.62208398133748, + "grad_norm": 1.882209300994873, + "learning_rate": 2.6065407102969664e-05, + "loss": 0.8227, + "num_input_tokens_seen": 12627520, + "step": 693 + }, + { + "epoch": 8.65940902021773, + "grad_norm": 1.6502835750579834, + "learning_rate": 2.560701704306336e-05, + "loss": 0.8731, + "num_input_tokens_seen": 12680768, + "step": 696 + }, + { + "epoch": 8.696734059097977, + "grad_norm": 1.6794430017471313, + "learning_rate": 2.5151301948622237e-05, + "loss": 0.8538, + "num_input_tokens_seen": 12734640, + "step": 699 + }, + { + "epoch": 8.734059097978227, + "grad_norm": 1.4031795263290405, + "learning_rate": 2.469831179394182e-05, + "loss": 0.8654, + "num_input_tokens_seen": 12787200, + "step": 702 + }, + { + "epoch": 8.771384136858476, + "grad_norm": 1.460951328277588, + "learning_rate": 2.4248096254497288e-05, + "loss": 0.8738, + "num_input_tokens_seen": 12840064, + "step": 705 + }, + { + "epoch": 8.808709175738725, + "grad_norm": 1.3095109462738037, + "learning_rate": 2.3800704701496053e-05, + "loss": 0.9721, + "num_input_tokens_seen": 12895776, + "step": 708 + }, + { + "epoch": 8.846034214618973, + "grad_norm": 1.21684992313385, + "learning_rate": 2.33561861964635e-05, + "loss": 0.9084, + "num_input_tokens_seen": 12946496, + "step": 711 + }, + { + "epoch": 8.883359253499222, + "grad_norm": 1.4043666124343872, + "learning_rate": 2.2914589485863014e-05, + "loss": 0.8723, + "num_input_tokens_seen": 12999616, + "step": 714 + }, + { + "epoch": 8.920684292379471, + "grad_norm": 1.6057589054107666, + "learning_rate": 2.247596299575022e-05, + "loss": 0.8646, + "num_input_tokens_seen": 13053840, + "step": 717 + }, + { + "epoch": 8.95800933125972, + "grad_norm": 1.4530081748962402, + "learning_rate": 2.2040354826462668e-05, + "loss": 0.869, + "num_input_tokens_seen": 13107104, + "step": 720 + }, + { + "epoch": 8.995334370139968, + "grad_norm": 1.2964198589324951, + "learning_rate": 2.160781274734495e-05, + "loss": 0.8733, + "num_input_tokens_seen": 13162592, + "step": 723 + }, + { + "epoch": 9.032659409020217, + "grad_norm": 1.2949548959732056, + "learning_rate": 2.117838419151034e-05, + "loss": 0.9076, + "num_input_tokens_seen": 13217488, + "step": 726 + }, + { + "epoch": 9.069984447900467, + "grad_norm": 1.4165751934051514, + "learning_rate": 2.0752116250639225e-05, + "loss": 0.828, + "num_input_tokens_seen": 13270704, + "step": 729 + }, + { + "epoch": 9.107309486780716, + "grad_norm": 1.1998887062072754, + "learning_rate": 2.0329055669814934e-05, + "loss": 0.8922, + "num_input_tokens_seen": 13325168, + "step": 732 + }, + { + "epoch": 9.144634525660964, + "grad_norm": 1.4165197610855103, + "learning_rate": 1.9909248842397584e-05, + "loss": 0.8684, + "num_input_tokens_seen": 13385488, + "step": 735 + }, + { + "epoch": 9.181959564541213, + "grad_norm": 1.4951928853988647, + "learning_rate": 1.9492741804936622e-05, + "loss": 0.8703, + "num_input_tokens_seen": 13439872, + "step": 738 + }, + { + "epoch": 9.219284603421462, + "grad_norm": 1.4441372156143188, + "learning_rate": 1.9079580232122303e-05, + "loss": 0.8381, + "num_input_tokens_seen": 13494080, + "step": 741 + }, + { + "epoch": 9.256609642301711, + "grad_norm": 1.3826444149017334, + "learning_rate": 1.866980943177699e-05, + "loss": 0.9077, + "num_input_tokens_seen": 13552208, + "step": 744 + }, + { + "epoch": 9.293934681181959, + "grad_norm": 1.3692766427993774, + "learning_rate": 1.8263474339886628e-05, + "loss": 0.8887, + "num_input_tokens_seen": 13608832, + "step": 747 + }, + { + "epoch": 9.331259720062208, + "grad_norm": 1.2762523889541626, + "learning_rate": 1.7860619515673033e-05, + "loss": 0.8205, + "num_input_tokens_seen": 13663760, + "step": 750 + }, + { + "epoch": 9.368584758942458, + "grad_norm": 1.3160815238952637, + "learning_rate": 1.746128913670746e-05, + "loss": 0.8274, + "num_input_tokens_seen": 13716800, + "step": 753 + }, + { + "epoch": 9.405909797822707, + "grad_norm": 1.5260809659957886, + "learning_rate": 1.7065526994065973e-05, + "loss": 0.9119, + "num_input_tokens_seen": 13774816, + "step": 756 + }, + { + "epoch": 9.443234836702954, + "grad_norm": 1.2529041767120361, + "learning_rate": 1.667337648752738e-05, + "loss": 0.8099, + "num_input_tokens_seen": 13830048, + "step": 759 + }, + { + "epoch": 9.480559875583204, + "grad_norm": 1.3622620105743408, + "learning_rate": 1.6284880620813848e-05, + "loss": 0.8317, + "num_input_tokens_seen": 13885008, + "step": 762 + }, + { + "epoch": 9.517884914463453, + "grad_norm": 1.245958924293518, + "learning_rate": 1.5900081996875083e-05, + "loss": 0.7895, + "num_input_tokens_seen": 13939504, + "step": 765 + }, + { + "epoch": 9.555209953343702, + "grad_norm": 1.214984655380249, + "learning_rate": 1.551902281321651e-05, + "loss": 0.8474, + "num_input_tokens_seen": 13992352, + "step": 768 + }, + { + "epoch": 9.59253499222395, + "grad_norm": 1.4553762674331665, + "learning_rate": 1.5141744857271778e-05, + "loss": 0.8418, + "num_input_tokens_seen": 14044656, + "step": 771 + }, + { + "epoch": 9.629860031104199, + "grad_norm": 1.6518710851669312, + "learning_rate": 1.4768289501820265e-05, + "loss": 0.7605, + "num_input_tokens_seen": 14095664, + "step": 774 + }, + { + "epoch": 9.667185069984448, + "grad_norm": 1.6158061027526855, + "learning_rate": 1.439869770045018e-05, + "loss": 0.8928, + "num_input_tokens_seen": 14151808, + "step": 777 + }, + { + "epoch": 9.704510108864696, + "grad_norm": 1.4202730655670166, + "learning_rate": 1.4033009983067452e-05, + "loss": 0.7745, + "num_input_tokens_seen": 14202128, + "step": 780 + }, + { + "epoch": 9.741835147744945, + "grad_norm": 1.5133007764816284, + "learning_rate": 1.367126645145121e-05, + "loss": 0.8861, + "num_input_tokens_seen": 14254320, + "step": 783 + }, + { + "epoch": 9.779160186625194, + "grad_norm": 1.3368923664093018, + "learning_rate": 1.3313506774856177e-05, + "loss": 0.7954, + "num_input_tokens_seen": 14307104, + "step": 786 + }, + { + "epoch": 9.816485225505444, + "grad_norm": 1.5365697145462036, + "learning_rate": 1.29597701856625e-05, + "loss": 0.8982, + "num_input_tokens_seen": 14364592, + "step": 789 + }, + { + "epoch": 9.853810264385691, + "grad_norm": 1.4836835861206055, + "learning_rate": 1.2610095475073414e-05, + "loss": 0.8608, + "num_input_tokens_seen": 14417696, + "step": 792 + }, + { + "epoch": 9.89113530326594, + "grad_norm": 1.3931705951690674, + "learning_rate": 1.22645209888614e-05, + "loss": 0.8443, + "num_input_tokens_seen": 14472720, + "step": 795 + }, + { + "epoch": 9.92846034214619, + "grad_norm": 1.2984020709991455, + "learning_rate": 1.1923084623163172e-05, + "loss": 0.8242, + "num_input_tokens_seen": 14526336, + "step": 798 + }, + { + "epoch": 9.96578538102644, + "grad_norm": 1.428997278213501, + "learning_rate": 1.1585823820323843e-05, + "loss": 0.7819, + "num_input_tokens_seen": 14577424, + "step": 801 + }, + { + "epoch": 10.003110419906687, + "grad_norm": 1.3891069889068604, + "learning_rate": 1.1252775564791024e-05, + "loss": 0.8295, + "num_input_tokens_seen": 14633552, + "step": 804 + }, + { + "epoch": 10.040435458786936, + "grad_norm": 1.2659319639205933, + "learning_rate": 1.0923976379059058e-05, + "loss": 0.8614, + "num_input_tokens_seen": 14690976, + "step": 807 + }, + { + "epoch": 10.077760497667185, + "grad_norm": 1.2245172262191772, + "learning_rate": 1.0599462319663905e-05, + "loss": 0.7815, + "num_input_tokens_seen": 14742128, + "step": 810 + }, + { + "epoch": 10.115085536547435, + "grad_norm": 1.368401288986206, + "learning_rate": 1.0279268973229089e-05, + "loss": 0.7795, + "num_input_tokens_seen": 14794288, + "step": 813 + }, + { + "epoch": 10.152410575427682, + "grad_norm": 1.4876455068588257, + "learning_rate": 9.963431452563332e-06, + "loss": 0.7585, + "num_input_tokens_seen": 14846560, + "step": 816 + }, + { + "epoch": 10.189735614307931, + "grad_norm": 1.377921223640442, + "learning_rate": 9.651984392809914e-06, + "loss": 0.7785, + "num_input_tokens_seen": 14900528, + "step": 819 + }, + { + "epoch": 10.22706065318818, + "grad_norm": 1.3406357765197754, + "learning_rate": 9.344961947648623e-06, + "loss": 0.848, + "num_input_tokens_seen": 14956624, + "step": 822 + }, + { + "epoch": 10.26438569206843, + "grad_norm": 1.312232494354248, + "learning_rate": 9.042397785550405e-06, + "loss": 0.8245, + "num_input_tokens_seen": 15013488, + "step": 825 + }, + { + "epoch": 10.301710730948678, + "grad_norm": 1.317514419555664, + "learning_rate": 8.744325086085248e-06, + "loss": 0.811, + "num_input_tokens_seen": 15068000, + "step": 828 + }, + { + "epoch": 10.339035769828927, + "grad_norm": 1.4466396570205688, + "learning_rate": 8.450776536283594e-06, + "loss": 0.8597, + "num_input_tokens_seen": 15124672, + "step": 831 + }, + { + "epoch": 10.376360808709176, + "grad_norm": 1.323728084564209, + "learning_rate": 8.16178432705192e-06, + "loss": 0.7463, + "num_input_tokens_seen": 15174960, + "step": 834 + }, + { + "epoch": 10.413685847589425, + "grad_norm": 1.355770468711853, + "learning_rate": 7.877380149642626e-06, + "loss": 0.7385, + "num_input_tokens_seen": 15228192, + "step": 837 + }, + { + "epoch": 10.451010886469673, + "grad_norm": 1.2545582056045532, + "learning_rate": 7.597595192178702e-06, + "loss": 0.8362, + "num_input_tokens_seen": 15282256, + "step": 840 + }, + { + "epoch": 10.488335925349922, + "grad_norm": 1.336365818977356, + "learning_rate": 7.322460136233622e-06, + "loss": 0.7866, + "num_input_tokens_seen": 15337392, + "step": 843 + }, + { + "epoch": 10.525660964230172, + "grad_norm": 1.2446874380111694, + "learning_rate": 7.052005153466779e-06, + "loss": 0.8247, + "num_input_tokens_seen": 15392448, + "step": 846 + }, + { + "epoch": 10.56298600311042, + "grad_norm": 1.2695002555847168, + "learning_rate": 6.786259902314768e-06, + "loss": 0.8383, + "num_input_tokens_seen": 15446672, + "step": 849 + }, + { + "epoch": 10.600311041990668, + "grad_norm": 1.2946388721466064, + "learning_rate": 6.52525352473905e-06, + "loss": 0.7268, + "num_input_tokens_seen": 15496064, + "step": 852 + }, + { + "epoch": 10.637636080870918, + "grad_norm": 1.3027721643447876, + "learning_rate": 6.269014643030213e-06, + "loss": 0.8733, + "num_input_tokens_seen": 15549952, + "step": 855 + }, + { + "epoch": 10.674961119751167, + "grad_norm": 1.340345025062561, + "learning_rate": 6.017571356669183e-06, + "loss": 0.8492, + "num_input_tokens_seen": 15607920, + "step": 858 + }, + { + "epoch": 10.712286158631414, + "grad_norm": 1.272009015083313, + "learning_rate": 5.770951239245803e-06, + "loss": 0.8154, + "num_input_tokens_seen": 15663280, + "step": 861 + }, + { + "epoch": 10.749611197511664, + "grad_norm": 1.3935860395431519, + "learning_rate": 5.529181335435124e-06, + "loss": 0.7953, + "num_input_tokens_seen": 15718496, + "step": 864 + }, + { + "epoch": 10.786936236391913, + "grad_norm": 1.3685379028320312, + "learning_rate": 5.292288158031594e-06, + "loss": 0.8559, + "num_input_tokens_seen": 15772560, + "step": 867 + }, + { + "epoch": 10.824261275272162, + "grad_norm": 1.5064653158187866, + "learning_rate": 5.060297685041659e-06, + "loss": 0.8111, + "num_input_tokens_seen": 15828464, + "step": 870 + }, + { + "epoch": 10.86158631415241, + "grad_norm": 1.5226088762283325, + "learning_rate": 4.833235356834959e-06, + "loss": 0.8946, + "num_input_tokens_seen": 15886160, + "step": 873 + }, + { + "epoch": 10.89891135303266, + "grad_norm": 1.6443517208099365, + "learning_rate": 4.611126073354571e-06, + "loss": 0.831, + "num_input_tokens_seen": 15938720, + "step": 876 + }, + { + "epoch": 10.936236391912908, + "grad_norm": 1.442195177078247, + "learning_rate": 4.3939941913863525e-06, + "loss": 0.8318, + "num_input_tokens_seen": 15992720, + "step": 879 + }, + { + "epoch": 10.973561430793158, + "grad_norm": 1.4696769714355469, + "learning_rate": 4.181863521888019e-06, + "loss": 0.7655, + "num_input_tokens_seen": 16049584, + "step": 882 + }, + { + "epoch": 11.010886469673405, + "grad_norm": 1.1691123247146606, + "learning_rate": 3.974757327377981e-06, + "loss": 0.8017, + "num_input_tokens_seen": 16103840, + "step": 885 + }, + { + "epoch": 11.048211508553655, + "grad_norm": 1.1555007696151733, + "learning_rate": 3.772698319384349e-06, + "loss": 0.8806, + "num_input_tokens_seen": 16160880, + "step": 888 + }, + { + "epoch": 11.085536547433904, + "grad_norm": 1.264450192451477, + "learning_rate": 3.575708655954324e-06, + "loss": 0.7852, + "num_input_tokens_seen": 16213280, + "step": 891 + }, + { + "epoch": 11.122861586314153, + "grad_norm": 1.1983911991119385, + "learning_rate": 3.3838099392243916e-06, + "loss": 0.7314, + "num_input_tokens_seen": 16265488, + "step": 894 + }, + { + "epoch": 11.1601866251944, + "grad_norm": 1.366458773612976, + "learning_rate": 3.197023213051337e-06, + "loss": 0.809, + "num_input_tokens_seen": 16321040, + "step": 897 + }, + { + "epoch": 11.19751166407465, + "grad_norm": 1.4491347074508667, + "learning_rate": 3.0153689607045845e-06, + "loss": 0.7076, + "num_input_tokens_seen": 16372560, + "step": 900 + }, + { + "epoch": 11.2348367029549, + "grad_norm": 1.481589674949646, + "learning_rate": 2.8388671026199522e-06, + "loss": 0.8021, + "num_input_tokens_seen": 16425856, + "step": 903 + }, + { + "epoch": 11.272161741835149, + "grad_norm": 1.3592849969863892, + "learning_rate": 2.667536994215186e-06, + "loss": 0.7633, + "num_input_tokens_seen": 16480176, + "step": 906 + }, + { + "epoch": 11.309486780715396, + "grad_norm": 1.3644659519195557, + "learning_rate": 2.501397423767382e-06, + "loss": 0.7514, + "num_input_tokens_seen": 16531248, + "step": 909 + }, + { + "epoch": 11.346811819595645, + "grad_norm": 1.3623534440994263, + "learning_rate": 2.340466610352654e-06, + "loss": 0.9144, + "num_input_tokens_seen": 16589952, + "step": 912 + }, + { + "epoch": 11.384136858475895, + "grad_norm": 1.3221518993377686, + "learning_rate": 2.1847622018482283e-06, + "loss": 0.7871, + "num_input_tokens_seen": 16642688, + "step": 915 + }, + { + "epoch": 11.421461897356144, + "grad_norm": 1.3355658054351807, + "learning_rate": 2.0343012729971243e-06, + "loss": 0.8018, + "num_input_tokens_seen": 16698016, + "step": 918 + }, + { + "epoch": 11.458786936236391, + "grad_norm": 1.3741073608398438, + "learning_rate": 1.8891003235357308e-06, + "loss": 0.7833, + "num_input_tokens_seen": 16751600, + "step": 921 + }, + { + "epoch": 11.49611197511664, + "grad_norm": 1.2644001245498657, + "learning_rate": 1.7491752763844293e-06, + "loss": 0.8172, + "num_input_tokens_seen": 16807424, + "step": 924 + }, + { + "epoch": 11.53343701399689, + "grad_norm": 1.3740483522415161, + "learning_rate": 1.6145414759014431e-06, + "loss": 0.7801, + "num_input_tokens_seen": 16863040, + "step": 927 + }, + { + "epoch": 11.57076205287714, + "grad_norm": 1.4756935834884644, + "learning_rate": 1.4852136862001764e-06, + "loss": 0.8497, + "num_input_tokens_seen": 16919056, + "step": 930 + }, + { + "epoch": 11.608087091757387, + "grad_norm": 1.253000020980835, + "learning_rate": 1.3612060895301759e-06, + "loss": 0.865, + "num_input_tokens_seen": 16976880, + "step": 933 + }, + { + "epoch": 11.645412130637636, + "grad_norm": 1.3113460540771484, + "learning_rate": 1.2425322847218368e-06, + "loss": 0.6891, + "num_input_tokens_seen": 17028016, + "step": 936 + }, + { + "epoch": 11.682737169517885, + "grad_norm": 1.2623776197433472, + "learning_rate": 1.1292052856952062e-06, + "loss": 0.8017, + "num_input_tokens_seen": 17084832, + "step": 939 + }, + { + "epoch": 11.720062208398133, + "grad_norm": 1.3193752765655518, + "learning_rate": 1.0212375200327973e-06, + "loss": 0.8494, + "num_input_tokens_seen": 17141568, + "step": 942 + }, + { + "epoch": 11.757387247278382, + "grad_norm": 1.3761273622512817, + "learning_rate": 9.186408276168013e-07, + "loss": 0.7922, + "num_input_tokens_seen": 17197456, + "step": 945 + }, + { + "epoch": 11.794712286158632, + "grad_norm": 1.3351306915283203, + "learning_rate": 8.214264593307098e-07, + "loss": 0.7013, + "num_input_tokens_seen": 17246704, + "step": 948 + }, + { + "epoch": 11.83203732503888, + "grad_norm": 1.3546366691589355, + "learning_rate": 7.296050758254957e-07, + "loss": 0.788, + "num_input_tokens_seen": 17301040, + "step": 951 + }, + { + "epoch": 11.869362363919128, + "grad_norm": 1.461629033088684, + "learning_rate": 6.431867463506048e-07, + "loss": 0.7436, + "num_input_tokens_seen": 17353760, + "step": 954 + }, + { + "epoch": 11.906687402799378, + "grad_norm": 1.3888185024261475, + "learning_rate": 5.621809476497098e-07, + "loss": 0.783, + "num_input_tokens_seen": 17408864, + "step": 957 + }, + { + "epoch": 11.944012441679627, + "grad_norm": 1.4601279497146606, + "learning_rate": 4.865965629214819e-07, + "loss": 0.8306, + "num_input_tokens_seen": 17463616, + "step": 960 + }, + { + "epoch": 11.981337480559876, + "grad_norm": 1.427171230316162, + "learning_rate": 4.1644188084548063e-07, + "loss": 0.9427, + "num_input_tokens_seen": 17521440, + "step": 963 + }, + { + "epoch": 12.018662519440124, + "grad_norm": 1.1976938247680664, + "learning_rate": 3.517245946731529e-07, + "loss": 0.8395, + "num_input_tokens_seen": 17578672, + "step": 966 + }, + { + "epoch": 12.055987558320373, + "grad_norm": 1.3307095766067505, + "learning_rate": 2.924518013842303e-07, + "loss": 0.8069, + "num_input_tokens_seen": 17635856, + "step": 969 + }, + { + "epoch": 12.093312597200622, + "grad_norm": 1.3354995250701904, + "learning_rate": 2.386300009084408e-07, + "loss": 0.7471, + "num_input_tokens_seen": 17689072, + "step": 972 + }, + { + "epoch": 12.130637636080872, + "grad_norm": 1.1849631071090698, + "learning_rate": 1.9026509541272275e-07, + "loss": 0.8251, + "num_input_tokens_seen": 17746800, + "step": 975 + }, + { + "epoch": 12.16796267496112, + "grad_norm": 1.19794762134552, + "learning_rate": 1.4736238865398765e-07, + "loss": 0.7807, + "num_input_tokens_seen": 17799264, + "step": 978 + }, + { + "epoch": 12.205287713841368, + "grad_norm": 1.3394030332565308, + "learning_rate": 1.0992658539750178e-07, + "loss": 0.8495, + "num_input_tokens_seen": 17857296, + "step": 981 + }, + { + "epoch": 12.242612752721618, + "grad_norm": 1.3379496335983276, + "learning_rate": 7.796179090094891e-08, + "loss": 0.7786, + "num_input_tokens_seen": 17911456, + "step": 984 + }, + { + "epoch": 12.279937791601867, + "grad_norm": 1.3171231746673584, + "learning_rate": 5.1471510464268236e-08, + "loss": 0.7214, + "num_input_tokens_seen": 17960800, + "step": 987 + }, + { + "epoch": 12.317262830482115, + "grad_norm": 1.3726296424865723, + "learning_rate": 3.04586490452119e-08, + "loss": 0.7353, + "num_input_tokens_seen": 18014816, + "step": 990 + }, + { + "epoch": 12.354587869362364, + "grad_norm": 1.3374559879302979, + "learning_rate": 1.4925510940844156e-08, + "loss": 0.7827, + "num_input_tokens_seen": 18069152, + "step": 993 + }, + { + "epoch": 12.391912908242613, + "grad_norm": 1.2918580770492554, + "learning_rate": 4.873799534788059e-09, + "loss": 0.871, + "num_input_tokens_seen": 18124176, + "step": 996 + }, + { + "epoch": 12.42923794712286, + "grad_norm": 1.5582739114761353, + "learning_rate": 3.0461711048035415e-10, + "loss": 0.8343, + "num_input_tokens_seen": 18177392, + "step": 999 + }, + { + "epoch": 12.441679626749611, + "eval_loss": 1.8893193006515503, + "eval_runtime": 10.0832, + "eval_samples_per_second": 99.175, + "eval_steps_per_second": 7.141, + "num_input_tokens_seen": 18194112, + "step": 1000 + }, + { + "epoch": 12.441679626749611, + "num_input_tokens_seen": 18194112, + "step": 1000, + "total_flos": 6.541222595474227e+17, + "train_loss": 1.1796256858706475, + "train_runtime": 3246.117, + "train_samples_per_second": 34.503, + "train_steps_per_second": 0.308 + } + ], + "logging_steps": 3, + "max_steps": 1000, + "num_input_tokens_seen": 18194112, + "num_train_epochs": 13, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.541222595474227e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/training_args.bin b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/training_args.bin new file mode 100644 index 00000000..514636e8 Binary files /dev/null and b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/training_args.bin differ diff --git a/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/training_eval_loss.png b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/training_eval_loss.png new file mode 100644 index 00000000..9207dfbe Binary files /dev/null and b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/training_eval_loss.png differ diff --git a/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/training_loss.png b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/training_loss.png new file mode 100644 index 00000000..907085bd Binary files /dev/null and b/results/lora_sft/ChatGLM2-6B/ChatGLM2_lora_sft_1/training_loss.png differ