From fa9a9007f9d4fad0a5c404f85463bea43653c6c1 Mon Sep 17 00:00:00 2001 From: wql Date: Wed, 4 Sep 2024 16:52:15 +0800 Subject: [PATCH 01/25] chore: add lora sft and predict template yaml file --- results/lora_sft_template.yaml | 42 ++++++++++++++++++++++++++++++++++ results/predict_template.yaml | 23 +++++++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 results/lora_sft_template.yaml create mode 100644 results/predict_template.yaml diff --git a/results/lora_sft_template.yaml b/results/lora_sft_template.yaml new file mode 100644 index 00000000..a3b42642 --- /dev/null +++ b/results/lora_sft_template.yaml @@ -0,0 +1,42 @@ +### model +model_name_or_path: ../../llm/baichuan + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_target: all + +### dataset +dataset: belle_1m +template: baichuan +cutoff_len: 1024 +max_samples: 10000 +overwrite_cache: true +preprocessing_num_workers: 16 + +### output +output_dir: ./results/lora_sft_2/Baichuan2-7B/Baichuan2_lora_sft_1_single_step500 +logging_steps: 3 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true + +### train +per_device_train_batch_size: 2 +gradient_accumulation_steps: 8 +learning_rate: 1.0e-4 +num_train_epochs: 10.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: 180000000 +max_steps: 500 +include_num_input_tokens_seen: true +include_tokens_per_second: true + +### eval +val_size: 0.1 +per_device_eval_batch_size: 2 +eval_strategy: steps +eval_steps: 500 \ No newline at end of file diff --git a/results/predict_template.yaml b/results/predict_template.yaml new file mode 100644 index 00000000..bafb3f5f --- /dev/null +++ b/results/predict_template.yaml @@ -0,0 +1,23 @@ +### model +model_name_or_path: ../../llm/baichuan + +### method +do_predict: true + +### dataset +eval_dataset: alpaca_gpt4_zh +template: baichuan +cutoff_len: 1024 +max_samples: 50 +overwrite_cache: true +preprocessing_num_workers: 16 +include_tokens_per_second: true + +### output +output_dir: ./results/inference/Baichuan2-7B/Baichuan2_predict_1 +overwrite_output_dir: true + +### eval +per_device_eval_batch_size: 2 +predict_with_generate: true +ddp_timeout: 180000000 From 3e548489ed32a96688541dc37b458204615dea97 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 11:28:19 +0800 Subject: [PATCH 02/25] feat: done easy run --- batch_run.sh | 1 + gpu_status.py | 26 ++++++++++++++++------ prepare_yaml_file.py | 52 +++++++++++++++++++++++++++++++++++++++++++ run_once.sh | 53 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 125 insertions(+), 7 deletions(-) create mode 100644 batch_run.sh create mode 100644 prepare_yaml_file.py create mode 100644 run_once.sh diff --git a/batch_run.sh b/batch_run.sh new file mode 100644 index 00000000..1f3329e1 --- /dev/null +++ b/batch_run.sh @@ -0,0 +1 @@ +bash run_once.sh lora_sft Baichuan-7B 4 50 diff --git a/gpu_status.py b/gpu_status.py index 9f89f519..f87d2e9c 100644 --- a/gpu_status.py +++ b/gpu_status.py @@ -1,16 +1,17 @@ import json +import sys import pynvml import time import psutil +UNIT = 1024 * 1024 * 1024 -def main(): - UNIT = 1024 * 1024 * 1024 - +def gpu_status(output_path = "./results/gpu_status", print_status = False): pynvml.nvmlInit() gpuDeviceCount = pynvml.nvmlDeviceGetCount() start_time = time.time() - + first_loop = True + while time.time() - start_time < 3600 *24: # print(time.time() - start_time) all_gpu_status = [] @@ -43,14 +44,25 @@ def main(): all_gpu_status = all_gpu_status, all_processes_status = all_processes_status ) - formatted_time = 
time.strftime('%Y%m%d%H%M%S', time.localtime()) - with open(f"./results/gpu_status/gpu_status_{formatted_time}.json", "a", encoding="utf-8") as f: + + with open(f"{output_path}/gpu_status.json", "a", encoding="utf-8") as f: f.write(json.dumps(logs) + "\n") - print(logs) + + if first_loop: + print("Start run gpu_status.py") + first_loop = False + + if print_status: + print(logs) time.sleep(60) + pynvml.nvmlShutdown() +def main(): + output_path = sys.argv[1] + print_status = sys.argv[2] + gpu_status(output_path, print_status) if __name__ == "__main__": main() diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py new file mode 100644 index 00000000..12dae8c5 --- /dev/null +++ b/prepare_yaml_file.py @@ -0,0 +1,52 @@ +import os +import sys +import time +import yaml +import json +import pynvml +import time +import psutil + +def main(): + run_type = sys.argv[1] + model = sys.argv[2] + max_steps = sys.argv[3] + run_name = sys.argv[4] + output_dir = sys.argv[5] + + if run_type == "lora_sft": + yaml_file = './results/lora_sft_template.yml' + elif run_type == "inference": + yaml_file = './results/predict_template.yml' + + if model == "9g-8B": + model_name_or_path = "../../models/sft_8b_v2" + template = "" + elif model == "Baichuan2-7B": + model_name_or_path = "../../models/Baichuan-7B" + template = "baichuan" + elif model == "ChatGLM2-6B": + model_name_or_path = "../../models/chatglm2-6b" + template = "chatglm2" + elif model == "Llama2-7B": + model_name_or_path = "../../models/llama-2-7b-ms" + template = "llama2" + elif model == "Qwen-7B": + model_name_or_path = "../../models/Qwen-7B" + template = "qwen" + + config = None + with open(yaml_file, 'r', encoding='utf-8') as f: + config = yaml.load(f.read(), Loader=yaml.FullLoader) + + config['model_name_or_path'] = model_name_or_path + config['template'] = template + config['output_dir'] = output_dir + if run_type == "lora_sft": + config['max_steps'] = max_steps + + with open(f'{output_dir}/{run_name}.yml', 'w', encoding='utf-8') as f: + yaml.dump(data=config, stream=f, allow_unicode=True) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/run_once.sh b/run_once.sh new file mode 100644 index 00000000..0d553c38 --- /dev/null +++ b/run_once.sh @@ -0,0 +1,53 @@ +run_type = $1 +model = $2 +gpu_cnt = $3 +max_steps = $4 + +current_datetime=$(date +%Y%m%d%H%M%S) + +if [ "${run_type}" = "lora_sft" ]; then + run_name="${run_type}_${model}_${gpu_cnt}_gpu_${max_steps}_step_${current_datetime}" + +else + run_name="${run_type}_${model}_${gpu_cnt}_gpu_${current_datetime}" +fi + +output_dir ="./results/${run_name}" + +if [ ! -d "$output_dir" ]; then + mkdir -p "$output_dir" + echo "路径不存在,已创建: $output_dir" +else + echo "路径已存在: $output_dir" +fi + +echo "${run_type} ${model} ${gpu_cnt} ${max_steps} ${run_name} ${output_dir}" +python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${output_dir} + + + +# export USE_MODELSCOPE_HUB=1 + +# # 0 means not printing gpu status +# python gpu_status.py ${output_dir} 0 & +# gpu_status_pid=$! +# echo "Start recording gpu status " + + +# if [ "${gpu_cnt}" = "1" ]; then +# ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml \ +# | tee ${output_dir}/log.txt" & +# train_pid=$! +# echo "Start train" +# else +# FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yml \ +# | tee ${output_dir}/log.txt" & +# train_pid=$! 
+# echo "Start train" +# fi + +# wait $train_pid +# echo "Train ended" +# sleep 90 +# kill $gpu_status_pid +# echo "Gpu status ended" \ No newline at end of file From ae308991fbb10b65958228c4785fef30de7f2e63 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 12:54:15 +0800 Subject: [PATCH 03/25] fix: fix first line --- run_once.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/run_once.sh b/run_once.sh index 0d553c38..de3dd6e7 100644 --- a/run_once.sh +++ b/run_once.sh @@ -1,3 +1,5 @@ +#!/bin/bash + run_type = $1 model = $2 gpu_cnt = $3 From 0cf37e5ec18bfd5df3fcbfee3f97d76a3345f5a2 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 12:57:41 +0800 Subject: [PATCH 04/25] fix: fix para --- run_once.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/run_once.sh b/run_once.sh index de3dd6e7..09786827 100644 --- a/run_once.sh +++ b/run_once.sh @@ -1,9 +1,9 @@ #!/bin/bash -run_type = $1 -model = $2 -gpu_cnt = $3 -max_steps = $4 +run_type = "$1" +model = "$2" +gpu_cnt = "$3" +max_steps = "$4" current_datetime=$(date +%Y%m%d%H%M%S) From f23e9d417ee1cfcb9c2383c3e14cae7fa39cb5df Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 12:59:02 +0800 Subject: [PATCH 05/25] fix: fix space --- run_once.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/run_once.sh b/run_once.sh index 09786827..7badc3fe 100644 --- a/run_once.sh +++ b/run_once.sh @@ -1,13 +1,13 @@ #!/bin/bash -run_type = "$1" -model = "$2" -gpu_cnt = "$3" -max_steps = "$4" +run_type="$1" +model="$2" +gpu_cnt="$3" +max_steps="$4" current_datetime=$(date +%Y%m%d%H%M%S) -if [ "${run_type}" = "lora_sft" ]; then +if [ "${run_type}"="lora_sft" ]; then run_name="${run_type}_${model}_${gpu_cnt}_gpu_${max_steps}_step_${current_datetime}" else @@ -36,7 +36,7 @@ python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${outp # echo "Start recording gpu status " -# if [ "${gpu_cnt}" = "1" ]; then +# if [ "${gpu_cnt}"="1" ]; then # ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml \ # | tee ${output_dir}/log.txt" & # train_pid=$! From 846fb7bfef830bf5e4477e826b06ae82e11870f8 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 12:59:52 +0800 Subject: [PATCH 06/25] fix: fix space --- run_once.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_once.sh b/run_once.sh index 7badc3fe..d084847a 100644 --- a/run_once.sh +++ b/run_once.sh @@ -14,7 +14,7 @@ else run_name="${run_type}_${model}_${gpu_cnt}_gpu_${current_datetime}" fi -output_dir ="./results/${run_name}" +output_dir="./results/${run_name}" if [ ! 
-d "$output_dir" ]; then mkdir -p "$output_dir" From 4058fd7d6487266da73c2e55fdf67b3964ac0052 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 13:01:49 +0800 Subject: [PATCH 07/25] fix: remove no use import --- prepare_yaml_file.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 12dae8c5..2be0451d 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -1,11 +1,5 @@ -import os import sys -import time import yaml -import json -import pynvml -import time -import psutil def main(): run_type = sys.argv[1] From cc99691cf4fbd5f16ac63ace88875207dbcc9a98 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 13:03:22 +0800 Subject: [PATCH 08/25] fix: fix file type --- prepare_yaml_file.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 2be0451d..10931229 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -9,9 +9,9 @@ def main(): output_dir = sys.argv[5] if run_type == "lora_sft": - yaml_file = './results/lora_sft_template.yml' + yaml_file = './results/lora_sft_template.yaml' elif run_type == "inference": - yaml_file = './results/predict_template.yml' + yaml_file = './results/predict_template.yaml' if model == "9g-8B": model_name_or_path = "../../models/sft_8b_v2" From ceb01459feb5cbe53b4c4465ce29c21b1742d760 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 13:04:54 +0800 Subject: [PATCH 09/25] fix: fix bug --- prepare_yaml_file.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 10931229..4c864328 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -12,7 +12,9 @@ def main(): yaml_file = './results/lora_sft_template.yaml' elif run_type == "inference": yaml_file = './results/predict_template.yaml' - + + model_name_or_path = "" + template = "" if model == "9g-8B": model_name_or_path = "../../models/sft_8b_v2" template = "" From 95b4b493e6a6b30dbd14b608ad16c81b6490bf94 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 13:09:43 +0800 Subject: [PATCH 10/25] chore: add echo --- prepare_yaml_file.py | 2 +- run_once.sh | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 4c864328..9ea58734 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -41,7 +41,7 @@ def main(): if run_type == "lora_sft": config['max_steps'] = max_steps - with open(f'{output_dir}/{run_name}.yml', 'w', encoding='utf-8') as f: + with open(f'{output_dir}/{run_name}.yaml', 'w', encoding='utf-8') as f: yaml.dump(data=config, stream=f, allow_unicode=True) if __name__ == "__main__": diff --git a/run_once.sh b/run_once.sh index d084847a..fdef74a1 100644 --- a/run_once.sh +++ b/run_once.sh @@ -18,14 +18,15 @@ output_dir="./results/${run_name}" if [ ! 
-d "$output_dir" ]; then mkdir -p "$output_dir" - echo "路径不存在,已创建: $output_dir" + echo "output_dir created: $output_dir" else - echo "路径已存在: $output_dir" + echo "output_dir exists: $output_dir" fi -echo "${run_type} ${model} ${gpu_cnt} ${max_steps} ${run_name} ${output_dir}" -python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${output_dir} +# echo "${run_type} ${model} ${gpu_cnt} ${max_steps} ${run_name} ${output_dir}" +python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${output_dir} +echo "yaml file save to {output_dir}/{run_name}.yaml" # export USE_MODELSCOPE_HUB=1 From 8162a54aa584155c237ba7833d11898ea32f6e56 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 13:10:34 +0800 Subject: [PATCH 11/25] fix:small fix --- run_once.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_once.sh b/run_once.sh index fdef74a1..0df70c36 100644 --- a/run_once.sh +++ b/run_once.sh @@ -26,7 +26,7 @@ fi # echo "${run_type} ${model} ${gpu_cnt} ${max_steps} ${run_name} ${output_dir}" python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${output_dir} -echo "yaml file save to {output_dir}/{run_name}.yaml" +echo "yaml file save to ${output_dir}/${run_name}.yaml" # export USE_MODELSCOPE_HUB=1 From 64044380bd45ecca457284dc5ddcbcf08c635a5d Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 13:21:38 +0800 Subject: [PATCH 12/25] fix: add not supported model err msg --- batch_run.sh | 2 +- prepare_yaml_file.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/batch_run.sh b/batch_run.sh index 1f3329e1..fd9d5f59 100644 --- a/batch_run.sh +++ b/batch_run.sh @@ -1 +1 @@ -bash run_once.sh lora_sft Baichuan-7B 4 50 +bash run_once.sh lora_sft Qwen-7B 4 50 diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 9ea58734..69cb300e 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -19,8 +19,8 @@ def main(): model_name_or_path = "../../models/sft_8b_v2" template = "" elif model == "Baichuan2-7B": - model_name_or_path = "../../models/Baichuan-7B" - template = "baichuan" + model_name_or_path = "../../models/Baichuan2-7B" + template = "baichuan2" elif model == "ChatGLM2-6B": model_name_or_path = "../../models/chatglm2-6b" template = "chatglm2" @@ -30,11 +30,15 @@ def main(): elif model == "Qwen-7B": model_name_or_path = "../../models/Qwen-7B" template = "qwen" + else: + print("ERROR: model not supported.") + sys.exit() config = None with open(yaml_file, 'r', encoding='utf-8') as f: config = yaml.load(f.read(), Loader=yaml.FullLoader) - + + config['model_name_or_path'] = model_name_or_path config['template'] = template config['output_dir'] = output_dir From 190fddf27d94e892b1843cdd241493ec30674c1d Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 13:37:17 +0800 Subject: [PATCH 13/25] fix: small change --- gpu_status.py | 7 ++++--- prepare_yaml_file.py | 2 ++ run_once.sh | 19 ++++++++++--------- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/gpu_status.py b/gpu_status.py index f87d2e9c..8b1a0e5c 100644 --- a/gpu_status.py +++ b/gpu_status.py @@ -6,7 +6,7 @@ import psutil UNIT = 1024 * 1024 * 1024 -def gpu_status(output_path = "./results/gpu_status", print_status = False): +def gpu_status(output_path = "./results/gpu_status", print_status = False, sleep_time = 60): pynvml.nvmlInit() gpuDeviceCount = pynvml.nvmlDeviceGetCount() start_time = time.time() @@ -55,14 +55,15 @@ def gpu_status(output_path = "./results/gpu_status", print_status = False): if print_status: 
From 190fddf27d94e892b1843cdd241493ec30674c1d Mon Sep 17 00:00:00 2001
From: wql
Date: Thu, 5 Sep 2024 13:37:17 +0800
Subject: [PATCH 13/25] fix: small change

---
 gpu_status.py        |  7 ++++---
 prepare_yaml_file.py |  2 ++
 run_once.sh          | 19 ++++++++++---------
 3 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/gpu_status.py b/gpu_status.py
index f87d2e9c..8b1a0e5c 100644
--- a/gpu_status.py
+++ b/gpu_status.py
 UNIT = 1024 * 1024 * 1024
 
-def gpu_status(output_path = "./results/gpu_status", print_status = False):
+def gpu_status(output_path = "./results/gpu_status", print_status = False, sleep_time = 60):
     pynvml.nvmlInit()
     gpuDeviceCount = pynvml.nvmlDeviceGetCount()
     start_time = time.time()
 
         if print_status:
             print(logs)
 
-        time.sleep(60)
+        time.sleep(sleep_time)
 
     pynvml.nvmlShutdown()
 
 def main():
     output_path = sys.argv[1]
     print_status = sys.argv[2]
-    gpu_status(output_path, print_status)
+    sleep_time = int(sys.argv[3])  # argv values are strings; time.sleep() needs a number
+    gpu_status(output_path, print_status, sleep_time)
 
 if __name__ == "__main__":
     main()
diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py
index 69cb300e..f5b8abc0 100644
--- a/prepare_yaml_file.py
+++ b/prepare_yaml_file.py
     with open(f'{output_dir}/{run_name}.yaml', 'w', encoding='utf-8') as f:
         yaml.dump(data=config, stream=f, allow_unicode=True)
+
+    print(f"yaml file saved to {output_dir}/{run_name}.yaml")
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/run_once.sh b/run_once.sh
index 0df70c36..bd8b5d87 100644
--- a/run_once.sh
+++ b/run_once.sh
 # echo "${run_type} ${model} ${gpu_cnt} ${max_steps} ${run_name} ${output_dir}"
 python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${output_dir}
-echo "yaml file save to ${output_dir}/${run_name}.yaml"
 
+export USE_MODELSCOPE_HUB=1
 
-# export USE_MODELSCOPE_HUB=1
 
-# # 0 means not printing gpu status
-# python gpu_status.py ${output_dir} 0 &
-# gpu_status_pid=$!
-# echo "Start recording gpu status "
+echo "Start recording gpu status "
 
+# 0 means not printing gpu status
+python gpu_status.py ${output_dir} 1 10 &
+gpu_status_pid=$!
+echo "${gpu_status_pid}"
 
+sleep 60
 
 # if [ "${gpu_cnt}"="1" ]; then
 # ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml \
 # | tee ${output_dir}/log.txt" &
 # train_pid=$!
 # echo "Start train"
 # else
 # FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yml \
 # | tee ${output_dir}/log.txt" &
 # train_pid=$!
 # echo "Start train"
 # fi
 
 # wait $train_pid
 # echo "Train ended"
 # sleep 90
-# kill $gpu_status_pid
-# echo "Gpu status ended"
\ No newline at end of file
+
+kill $gpu_status_pid
+echo "Gpu status ended"
\ No newline at end of file

From 36840f031033aac24176662594dac7488627614d Mon Sep 17 00:00:00 2001
From: wql
Date: Thu, 5 Sep 2024 14:35:47 +0800
Subject: [PATCH 14/25] fix: fix baichuan2 path

---
 prepare_yaml_file.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py
index f5b8abc0..36357f6b 100644
--- a/prepare_yaml_file.py
+++ b/prepare_yaml_file.py
     elif model == "Baichuan2-7B":
-        model_name_or_path = "../../models/Baichuan2-7B"
+        model_name_or_path = "../../models/Baichuan2-7B-Base"
         template = "baichuan2"
     elif model == "ChatGLM2-6B":
         model_name_or_path = "../../models/chatglm2-6b"
         template = "chatglm2"
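The sys.argv handling in gpu_status.py (touched in "fix: small change" above) is fragile: argv values are strings, so passing "0" for print_status is still truthy, and time.sleep() rejects a string sleep_time. A minimal sketch of stricter parsing with argparse, assuming the same three positional options; hypothetical, not part of this series:

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="poll GPU status via pynvml")
    parser.add_argument("output_path", help="directory that receives gpu_status.json")
    # type=int makes "0" really mean off after the bool() conversion below
    parser.add_argument("print_status", type=int, choices=[0, 1],
                        help="1 prints each status line, 0 stays quiet")
    parser.add_argument("sleep_time", type=int, nargs="?", default=60,
                        help="seconds between polls")
    return parser.parse_args()

if __name__ == "__main__":
    args = parse_args()
    # gpu_status(args.output_path, bool(args.print_status), args.sleep_time)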
+# echo "${gpu_status_pid}" -sleep 60 +if [ "${gpu_cnt}"="1" ]; then + ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml \ + | tee ${output_dir}/log.txt" & + train_pid=$! + echo "Start train" +else + FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yml \ + | tee ${output_dir}/log.txt" & + train_pid=$! + echo "Start train" +fi -# if [ "${gpu_cnt}"="1" ]; then -# ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml \ -# | tee ${output_dir}/log.txt" & -# train_pid=$! -# echo "Start train" -# else -# FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yml \ -# | tee ${output_dir}/log.txt" & -# train_pid=$! -# echo "Start train" -# fi +wait $train_pid +echo "Train ended" -# wait $train_pid -# echo "Train ended" # sleep 90 - -kill $gpu_status_pid -echo "Gpu status ended" \ No newline at end of file +# kill $gpu_status_pid +# echo "Gpu status ended" \ No newline at end of file From f71f62f2f6bc6ec29bedb7aded36d9991d5de60c Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 14:47:17 +0800 Subject: [PATCH 16/25] fix: fix typo --- run_once.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/run_once.sh b/run_once.sh index 2c93c026..cf13de0b 100644 --- a/run_once.sh +++ b/run_once.sh @@ -35,13 +35,11 @@ export USE_MODELSCOPE_HUB=1 # echo "${gpu_status_pid}" if [ "${gpu_cnt}"="1" ]; then - ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml \ - | tee ${output_dir}/log.txt" & + ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml | tee "${output_dir}/log.txt" & train_pid=$! echo "Start train" else - FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yml \ - | tee ${output_dir}/log.txt" & + FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yml | tee "${output_dir}/log.txt" & train_pid=$! 
echo "Start train" fi From ab4bf8bd4dc6b6bf5c10f4d32a61cb9b9dc935ac Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 06:52:33 +0000 Subject: [PATCH 17/25] add: add all test results --- ...ichuan-7B_4_gpu_50_step_20240905050501.yml | 31 +++++++++++++++++++ ...chuan-7B_4_gpu_50_step_20240905050958.yaml | 31 +++++++++++++++++++ ...chuan-7B_4_gpu_50_step_20240905051039.yaml | 31 +++++++++++++++++++ ..._Qwen-7B_4_gpu_50_step_20240905052241.yaml | 31 +++++++++++++++++++ ..._Qwen-7B_4_gpu_50_step_20240905053758.yaml | 31 +++++++++++++++++++ ...g.txt &\n train_pid=1720\n echo Start" | 0 ..._Qwen-7B_4_gpu_50_step_20240905064243.yaml | 31 +++++++++++++++++++ .../log.txt | 0 ..._Qwen-7B_4_gpu_50_step_20240905064736.yaml | 31 +++++++++++++++++++ 9 files changed, 217 insertions(+) create mode 100644 results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501.yml create mode 100644 results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958.yaml create mode 100644 results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039.yaml create mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241.yaml create mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758.yaml create mode 100644 "results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/log.txt &\n train_pid=1720\n echo Start" create mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243.yaml create mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/log.txt create mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736.yaml diff --git a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501.yml b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501.yml new file mode 100644 index 00000000..26507813 --- /dev/null +++ b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501.yml @@ -0,0 +1,31 @@ +bf16: true +cutoff_len: 1024 +dataset: belle_1m +ddp_timeout: 180000000 +do_train: true +eval_steps: 500 +eval_strategy: steps +finetuning_type: lora +gradient_accumulation_steps: 8 +include_num_input_tokens_seen: true +include_tokens_per_second: true +learning_rate: 0.0001 +logging_steps: 3 +lora_target: all +lr_scheduler_type: cosine +max_samples: 10000 +max_steps: '50' +model_name_or_path: '' +num_train_epochs: 10.0 +output_dir: ./results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501 +overwrite_cache: true +overwrite_output_dir: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +plot_loss: true +preprocessing_num_workers: 16 +save_steps: 500 +stage: sft +template: '' +val_size: 0.1 +warmup_ratio: 0.1 diff --git a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958.yaml b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958.yaml new file mode 100644 index 00000000..e041b60b --- /dev/null +++ b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958.yaml @@ -0,0 +1,31 @@ +bf16: true +cutoff_len: 1024 +dataset: belle_1m +ddp_timeout: 180000000 
+do_train: true +eval_steps: 500 +eval_strategy: steps +finetuning_type: lora +gradient_accumulation_steps: 8 +include_num_input_tokens_seen: true +include_tokens_per_second: true +learning_rate: 0.0001 +logging_steps: 3 +lora_target: all +lr_scheduler_type: cosine +max_samples: 10000 +max_steps: '50' +model_name_or_path: '' +num_train_epochs: 10.0 +output_dir: ./results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958 +overwrite_cache: true +overwrite_output_dir: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +plot_loss: true +preprocessing_num_workers: 16 +save_steps: 500 +stage: sft +template: '' +val_size: 0.1 +warmup_ratio: 0.1 diff --git a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039.yaml b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039.yaml new file mode 100644 index 00000000..84e13b18 --- /dev/null +++ b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039.yaml @@ -0,0 +1,31 @@ +bf16: true +cutoff_len: 1024 +dataset: belle_1m +ddp_timeout: 180000000 +do_train: true +eval_steps: 500 +eval_strategy: steps +finetuning_type: lora +gradient_accumulation_steps: 8 +include_num_input_tokens_seen: true +include_tokens_per_second: true +learning_rate: 0.0001 +logging_steps: 3 +lora_target: all +lr_scheduler_type: cosine +max_samples: 10000 +max_steps: '50' +model_name_or_path: '' +num_train_epochs: 10.0 +output_dir: ./results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039 +overwrite_cache: true +overwrite_output_dir: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +plot_loss: true +preprocessing_num_workers: 16 +save_steps: 500 +stage: sft +template: '' +val_size: 0.1 +warmup_ratio: 0.1 diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241.yaml new file mode 100644 index 00000000..2a1de0fe --- /dev/null +++ b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241.yaml @@ -0,0 +1,31 @@ +bf16: true +cutoff_len: 1024 +dataset: belle_1m +ddp_timeout: 180000000 +do_train: true +eval_steps: 500 +eval_strategy: steps +finetuning_type: lora +gradient_accumulation_steps: 8 +include_num_input_tokens_seen: true +include_tokens_per_second: true +learning_rate: 0.0001 +logging_steps: 3 +lora_target: all +lr_scheduler_type: cosine +max_samples: 10000 +max_steps: '50' +model_name_or_path: ../../models/Qwen-7B +num_train_epochs: 10.0 +output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241 +overwrite_cache: true +overwrite_output_dir: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +plot_loss: true +preprocessing_num_workers: 16 +save_steps: 500 +stage: sft +template: qwen +val_size: 0.1 +warmup_ratio: 0.1 diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758.yaml new file mode 100644 index 00000000..caa1505f --- /dev/null +++ b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758.yaml @@ -0,0 +1,31 @@ +bf16: true +cutoff_len: 1024 +dataset: belle_1m +ddp_timeout: 180000000 +do_train: true +eval_steps: 500 +eval_strategy: steps 
+finetuning_type: lora +gradient_accumulation_steps: 8 +include_num_input_tokens_seen: true +include_tokens_per_second: true +learning_rate: 0.0001 +logging_steps: 3 +lora_target: all +lr_scheduler_type: cosine +max_samples: 10000 +max_steps: '50' +model_name_or_path: ../../models/Qwen-7B +num_train_epochs: 10.0 +output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758 +overwrite_cache: true +overwrite_output_dir: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +plot_loss: true +preprocessing_num_workers: 16 +save_steps: 500 +stage: sft +template: qwen +val_size: 0.1 +warmup_ratio: 0.1 diff --git "a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/log.txt &\n train_pid=1720\n echo Start" "b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/log.txt &\n train_pid=1720\n echo Start" new file mode 100644 index 00000000..e69de29b diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243.yaml new file mode 100644 index 00000000..4631b614 --- /dev/null +++ b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243.yaml @@ -0,0 +1,31 @@ +bf16: true +cutoff_len: 1024 +dataset: belle_1m +ddp_timeout: 180000000 +do_train: true +eval_steps: 500 +eval_strategy: steps +finetuning_type: lora +gradient_accumulation_steps: 8 +include_num_input_tokens_seen: true +include_tokens_per_second: true +learning_rate: 0.0001 +logging_steps: 3 +lora_target: all +lr_scheduler_type: cosine +max_samples: 10000 +max_steps: '50' +model_name_or_path: ../../models/Qwen-7B +num_train_epochs: 10.0 +output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243 +overwrite_cache: true +overwrite_output_dir: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +plot_loss: true +preprocessing_num_workers: 16 +save_steps: 500 +stage: sft +template: qwen +val_size: 0.1 +warmup_ratio: 0.1 diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/log.txt b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/log.txt new file mode 100644 index 00000000..e69de29b diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736.yaml new file mode 100644 index 00000000..fe61b1e4 --- /dev/null +++ b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736.yaml @@ -0,0 +1,31 @@ +bf16: true +cutoff_len: 1024 +dataset: belle_1m +ddp_timeout: 180000000 +do_train: true +eval_steps: 500 +eval_strategy: steps +finetuning_type: lora +gradient_accumulation_steps: 8 +include_num_input_tokens_seen: true +include_tokens_per_second: true +learning_rate: 0.0001 +logging_steps: 3 +lora_target: all +lr_scheduler_type: cosine +max_samples: 10000 +max_steps: '50' +model_name_or_path: ../../models/Qwen-7B +num_train_epochs: 10.0 +output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736 +overwrite_cache: true +overwrite_output_dir: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +plot_loss: true +preprocessing_num_workers: 16 +save_steps: 500 +stage: sft +template: qwen +val_size: 0.1 +warmup_ratio: 0.1 From c6a4d43c068cc6e1f680e7c2f08603800b598039 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 07:05:47 +0000 Subject: [PATCH 18/25] fix: remove no need 
test file --- ...ichuan-7B_4_gpu_50_step_20240905050501.yml | 31 ------------------- ...chuan-7B_4_gpu_50_step_20240905050958.yaml | 31 ------------------- ...chuan-7B_4_gpu_50_step_20240905051039.yaml | 31 ------------------- ..._Qwen-7B_4_gpu_50_step_20240905052241.yaml | 31 ------------------- ..._Qwen-7B_4_gpu_50_step_20240905053758.yaml | 31 ------------------- ...g.txt &\n train_pid=1720\n echo Start" | 0 ..._Qwen-7B_4_gpu_50_step_20240905064243.yaml | 31 ------------------- .../log.txt | 0 ..._Qwen-7B_4_gpu_50_step_20240905064736.yaml | 31 ------------------- 9 files changed, 217 deletions(-) delete mode 100644 results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501.yml delete mode 100644 results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958.yaml delete mode 100644 results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039.yaml delete mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241.yaml delete mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758.yaml delete mode 100644 "results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/log.txt &\n train_pid=1720\n echo Start" delete mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243.yaml delete mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/log.txt delete mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736.yaml diff --git a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501.yml b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501.yml deleted file mode 100644 index 26507813..00000000 --- a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501.yml +++ /dev/null @@ -1,31 +0,0 @@ -bf16: true -cutoff_len: 1024 -dataset: belle_1m -ddp_timeout: 180000000 -do_train: true -eval_steps: 500 -eval_strategy: steps -finetuning_type: lora -gradient_accumulation_steps: 8 -include_num_input_tokens_seen: true -include_tokens_per_second: true -learning_rate: 0.0001 -logging_steps: 3 -lora_target: all -lr_scheduler_type: cosine -max_samples: 10000 -max_steps: '50' -model_name_or_path: '' -num_train_epochs: 10.0 -output_dir: ./results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501 -overwrite_cache: true -overwrite_output_dir: true -per_device_eval_batch_size: 2 -per_device_train_batch_size: 2 -plot_loss: true -preprocessing_num_workers: 16 -save_steps: 500 -stage: sft -template: '' -val_size: 0.1 -warmup_ratio: 0.1 diff --git a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958.yaml b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958.yaml deleted file mode 100644 index e041b60b..00000000 --- a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958.yaml +++ /dev/null @@ -1,31 +0,0 @@ -bf16: true -cutoff_len: 1024 -dataset: belle_1m -ddp_timeout: 180000000 -do_train: true -eval_steps: 500 -eval_strategy: steps -finetuning_type: lora -gradient_accumulation_steps: 8 -include_num_input_tokens_seen: true 
-include_tokens_per_second: true -learning_rate: 0.0001 -logging_steps: 3 -lora_target: all -lr_scheduler_type: cosine -max_samples: 10000 -max_steps: '50' -model_name_or_path: '' -num_train_epochs: 10.0 -output_dir: ./results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958 -overwrite_cache: true -overwrite_output_dir: true -per_device_eval_batch_size: 2 -per_device_train_batch_size: 2 -plot_loss: true -preprocessing_num_workers: 16 -save_steps: 500 -stage: sft -template: '' -val_size: 0.1 -warmup_ratio: 0.1 diff --git a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039.yaml b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039.yaml deleted file mode 100644 index 84e13b18..00000000 --- a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039.yaml +++ /dev/null @@ -1,31 +0,0 @@ -bf16: true -cutoff_len: 1024 -dataset: belle_1m -ddp_timeout: 180000000 -do_train: true -eval_steps: 500 -eval_strategy: steps -finetuning_type: lora -gradient_accumulation_steps: 8 -include_num_input_tokens_seen: true -include_tokens_per_second: true -learning_rate: 0.0001 -logging_steps: 3 -lora_target: all -lr_scheduler_type: cosine -max_samples: 10000 -max_steps: '50' -model_name_or_path: '' -num_train_epochs: 10.0 -output_dir: ./results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039 -overwrite_cache: true -overwrite_output_dir: true -per_device_eval_batch_size: 2 -per_device_train_batch_size: 2 -plot_loss: true -preprocessing_num_workers: 16 -save_steps: 500 -stage: sft -template: '' -val_size: 0.1 -warmup_ratio: 0.1 diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241.yaml deleted file mode 100644 index 2a1de0fe..00000000 --- a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241.yaml +++ /dev/null @@ -1,31 +0,0 @@ -bf16: true -cutoff_len: 1024 -dataset: belle_1m -ddp_timeout: 180000000 -do_train: true -eval_steps: 500 -eval_strategy: steps -finetuning_type: lora -gradient_accumulation_steps: 8 -include_num_input_tokens_seen: true -include_tokens_per_second: true -learning_rate: 0.0001 -logging_steps: 3 -lora_target: all -lr_scheduler_type: cosine -max_samples: 10000 -max_steps: '50' -model_name_or_path: ../../models/Qwen-7B -num_train_epochs: 10.0 -output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241 -overwrite_cache: true -overwrite_output_dir: true -per_device_eval_batch_size: 2 -per_device_train_batch_size: 2 -plot_loss: true -preprocessing_num_workers: 16 -save_steps: 500 -stage: sft -template: qwen -val_size: 0.1 -warmup_ratio: 0.1 diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758.yaml deleted file mode 100644 index caa1505f..00000000 --- a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758.yaml +++ /dev/null @@ -1,31 +0,0 @@ -bf16: true -cutoff_len: 1024 -dataset: belle_1m -ddp_timeout: 180000000 -do_train: true -eval_steps: 500 -eval_strategy: steps -finetuning_type: lora -gradient_accumulation_steps: 8 -include_num_input_tokens_seen: true -include_tokens_per_second: true -learning_rate: 0.0001 
-logging_steps: 3 -lora_target: all -lr_scheduler_type: cosine -max_samples: 10000 -max_steps: '50' -model_name_or_path: ../../models/Qwen-7B -num_train_epochs: 10.0 -output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758 -overwrite_cache: true -overwrite_output_dir: true -per_device_eval_batch_size: 2 -per_device_train_batch_size: 2 -plot_loss: true -preprocessing_num_workers: 16 -save_steps: 500 -stage: sft -template: qwen -val_size: 0.1 -warmup_ratio: 0.1 diff --git "a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/log.txt &\n train_pid=1720\n echo Start" "b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/log.txt &\n train_pid=1720\n echo Start" deleted file mode 100644 index e69de29b..00000000 diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243.yaml deleted file mode 100644 index 4631b614..00000000 --- a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243.yaml +++ /dev/null @@ -1,31 +0,0 @@ -bf16: true -cutoff_len: 1024 -dataset: belle_1m -ddp_timeout: 180000000 -do_train: true -eval_steps: 500 -eval_strategy: steps -finetuning_type: lora -gradient_accumulation_steps: 8 -include_num_input_tokens_seen: true -include_tokens_per_second: true -learning_rate: 0.0001 -logging_steps: 3 -lora_target: all -lr_scheduler_type: cosine -max_samples: 10000 -max_steps: '50' -model_name_or_path: ../../models/Qwen-7B -num_train_epochs: 10.0 -output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243 -overwrite_cache: true -overwrite_output_dir: true -per_device_eval_batch_size: 2 -per_device_train_batch_size: 2 -plot_loss: true -preprocessing_num_workers: 16 -save_steps: 500 -stage: sft -template: qwen -val_size: 0.1 -warmup_ratio: 0.1 diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/log.txt b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/log.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736.yaml deleted file mode 100644 index fe61b1e4..00000000 --- a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736.yaml +++ /dev/null @@ -1,31 +0,0 @@ -bf16: true -cutoff_len: 1024 -dataset: belle_1m -ddp_timeout: 180000000 -do_train: true -eval_steps: 500 -eval_strategy: steps -finetuning_type: lora -gradient_accumulation_steps: 8 -include_num_input_tokens_seen: true -include_tokens_per_second: true -learning_rate: 0.0001 -logging_steps: 3 -lora_target: all -lr_scheduler_type: cosine -max_samples: 10000 -max_steps: '50' -model_name_or_path: ../../models/Qwen-7B -num_train_epochs: 10.0 -output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736 -overwrite_cache: true -overwrite_output_dir: true -per_device_eval_batch_size: 2 -per_device_train_batch_size: 2 -plot_loss: true -preprocessing_num_workers: 16 -save_steps: 500 -stage: sft -template: qwen -val_size: 0.1 -warmup_ratio: 0.1 From 62a486dfc0151807016832b463ae45adb7fea163 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 07:07:49 +0000 Subject: [PATCH 19/25] add: add test file --- .../log.txt | 0 ..._Qwen-7B_4_gpu_50_step_20240905070656.yaml | 31 +++++++++++++++++++ 2 files changed, 31 insertions(+) create 
mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656/log.txt create mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656.yaml diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656/log.txt b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656/log.txt new file mode 100644 index 00000000..e69de29b diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656.yaml new file mode 100644 index 00000000..410ed726 --- /dev/null +++ b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656.yaml @@ -0,0 +1,31 @@ +bf16: true +cutoff_len: 1024 +dataset: belle_1m +ddp_timeout: 180000000 +do_train: true +eval_steps: 500 +eval_strategy: steps +finetuning_type: lora +gradient_accumulation_steps: 8 +include_num_input_tokens_seen: true +include_tokens_per_second: true +learning_rate: 0.0001 +logging_steps: 3 +lora_target: all +lr_scheduler_type: cosine +max_samples: 10000 +max_steps: '50' +model_name_or_path: ../../models/Qwen-7B +num_train_epochs: 10.0 +output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656 +overwrite_cache: true +overwrite_output_dir: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +plot_loss: true +preprocessing_num_workers: 16 +save_steps: 500 +stage: sft +template: qwen +val_size: 0.1 +warmup_ratio: 0.1 From 5baa46a798168961d6c2fc68f49ad9309b606133 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 15:15:37 +0800 Subject: [PATCH 20/25] fix: test fix --- prepare_yaml_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 36357f6b..e29993c6 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -28,7 +28,7 @@ def main(): model_name_or_path = "../../models/llama-2-7b-ms" template = "llama2" elif model == "Qwen-7B": - model_name_or_path = "../../models/Qwen-7B" + model_name_or_path = "'../../models/Qwen-7B'" template = "qwen" else: print("ERROR: model not supported.") From 3d018c82487239b309d0c44e6757d47079ddf00e Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 15:17:39 +0800 Subject: [PATCH 21/25] fix: fix typo --- run_once.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/run_once.sh b/run_once.sh index cf13de0b..ce85dc49 100644 --- a/run_once.sh +++ b/run_once.sh @@ -35,11 +35,11 @@ export USE_MODELSCOPE_HUB=1 # echo "${gpu_status_pid}" if [ "${gpu_cnt}"="1" ]; then - ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml | tee "${output_dir}/log.txt" & + ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yaml | tee "${output_dir}/log.txt" & train_pid=$! echo "Start train" else - FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yml | tee "${output_dir}/log.txt" & + FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yaml | tee "${output_dir}/log.txt" & train_pid=$! 
echo "Start train" fi From 2248960fe72b89738b0f7e5d0bd29e3bda63c470 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 15:18:37 +0800 Subject: [PATCH 22/25] fix: fix format --- prepare_yaml_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index e29993c6..36357f6b 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -28,7 +28,7 @@ def main(): model_name_or_path = "../../models/llama-2-7b-ms" template = "llama2" elif model == "Qwen-7B": - model_name_or_path = "'../../models/Qwen-7B'" + model_name_or_path = "../../models/Qwen-7B" template = "qwen" else: print("ERROR: model not supported.") From e754b62ccd2b23149d5ac5942e48d443c059897a Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 15:28:27 +0800 Subject: [PATCH 23/25] fix: test --- prepare_yaml_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 36357f6b..7428f229 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -28,7 +28,7 @@ def main(): model_name_or_path = "../../models/llama-2-7b-ms" template = "llama2" elif model == "Qwen-7B": - model_name_or_path = "../../models/Qwen-7B" + model_name_or_path = "/root/models/Qwen-7B" template = "qwen" else: print("ERROR: model not supported.") From f15e37dfad640378836930d640241a630182b82e Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 15:49:32 +0800 Subject: [PATCH 24/25] fix: fix bf16 --- prepare_yaml_file.py | 2 +- results/lora_sft_template.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 7428f229..36357f6b 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -28,7 +28,7 @@ def main(): model_name_or_path = "../../models/llama-2-7b-ms" template = "llama2" elif model == "Qwen-7B": - model_name_or_path = "/root/models/Qwen-7B" + model_name_or_path = "../../models/Qwen-7B" template = "qwen" else: print("ERROR: model not supported.") diff --git a/results/lora_sft_template.yaml b/results/lora_sft_template.yaml index a3b42642..9a4411e4 100644 --- a/results/lora_sft_template.yaml +++ b/results/lora_sft_template.yaml @@ -29,7 +29,7 @@ learning_rate: 1.0e-4 num_train_epochs: 10.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -bf16: true +fp16: true ddp_timeout: 180000000 max_steps: 500 include_num_input_tokens_seen: true From 113966157cc5163aee909424a2515274534c5058 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 15:54:33 +0800 Subject: [PATCH 25/25] fix: fix max steps --- prepare_yaml_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 36357f6b..fb27f279 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -43,7 +43,7 @@ def main(): config['template'] = template config['output_dir'] = output_dir if run_type == "lora_sft": - config['max_steps'] = max_steps + config['max_steps'] = int(max_steps) with open(f'{output_dir}/{run_name}.yaml', 'w', encoding='utf-8') as f: yaml.dump(data=config, stream=f, allow_unicode=True)