From fa9a9007f9d4fad0a5c404f85463bea43653c6c1 Mon Sep 17 00:00:00 2001 From: wql Date: Wed, 4 Sep 2024 16:52:15 +0800 Subject: [PATCH 01/25] chore: add lora sft and predict template yaml file --- results/lora_sft_template.yaml | 42 ++++++++++++++++++++++++++++++++++ results/predict_template.yaml | 23 +++++++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 results/lora_sft_template.yaml create mode 100644 results/predict_template.yaml diff --git a/results/lora_sft_template.yaml b/results/lora_sft_template.yaml new file mode 100644 index 00000000..a3b42642 --- /dev/null +++ b/results/lora_sft_template.yaml @@ -0,0 +1,42 @@ +### model +model_name_or_path: ../../llm/baichuan + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_target: all + +### dataset +dataset: belle_1m +template: baichuan +cutoff_len: 1024 +max_samples: 10000 +overwrite_cache: true +preprocessing_num_workers: 16 + +### output +output_dir: ./results/lora_sft_2/Baichuan2-7B/Baichuan2_lora_sft_1_single_step500 +logging_steps: 3 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true + +### train +per_device_train_batch_size: 2 +gradient_accumulation_steps: 8 +learning_rate: 1.0e-4 +num_train_epochs: 10.0 +lr_scheduler_type: cosine +warmup_ratio: 0.1 +bf16: true +ddp_timeout: 180000000 +max_steps: 500 +include_num_input_tokens_seen: true +include_tokens_per_second: true + +### eval +val_size: 0.1 +per_device_eval_batch_size: 2 +eval_strategy: steps +eval_steps: 500 \ No newline at end of file diff --git a/results/predict_template.yaml b/results/predict_template.yaml new file mode 100644 index 00000000..bafb3f5f --- /dev/null +++ b/results/predict_template.yaml @@ -0,0 +1,23 @@ +### model +model_name_or_path: ../../llm/baichuan + +### method +do_predict: true + +### dataset +eval_dataset: alpaca_gpt4_zh +template: baichuan +cutoff_len: 1024 +max_samples: 50 +overwrite_cache: true +preprocessing_num_workers: 16 +include_tokens_per_second: true + +### output +output_dir: ./results/inference/Baichuan2-7B/Baichuan2_predict_1 +overwrite_output_dir: true + +### eval +per_device_eval_batch_size: 2 +predict_with_generate: true +ddp_timeout: 180000000 From 3e548489ed32a96688541dc37b458204615dea97 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 11:28:19 +0800 Subject: [PATCH 02/25] feat: done easy run --- batch_run.sh | 1 + gpu_status.py | 26 ++++++++++++++++------ prepare_yaml_file.py | 52 +++++++++++++++++++++++++++++++++++++++++++ run_once.sh | 53 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 125 insertions(+), 7 deletions(-) create mode 100644 batch_run.sh create mode 100644 prepare_yaml_file.py create mode 100644 run_once.sh diff --git a/batch_run.sh b/batch_run.sh new file mode 100644 index 00000000..1f3329e1 --- /dev/null +++ b/batch_run.sh @@ -0,0 +1 @@ +bash run_once.sh lora_sft Baichuan-7B 4 50 diff --git a/gpu_status.py b/gpu_status.py index 9f89f519..f87d2e9c 100644 --- a/gpu_status.py +++ b/gpu_status.py @@ -1,16 +1,17 @@ import json +import sys import pynvml import time import psutil +UNIT = 1024 * 1024 * 1024 -def main(): - UNIT = 1024 * 1024 * 1024 - +def gpu_status(output_path = "./results/gpu_status", print_status = False): pynvml.nvmlInit() gpuDeviceCount = pynvml.nvmlDeviceGetCount() start_time = time.time() - + first_loop = True + while time.time() - start_time < 3600 *24: # print(time.time() - start_time) all_gpu_status = [] @@ -43,14 +44,25 @@ def main(): all_gpu_status = all_gpu_status, all_processes_status = all_processes_status ) - formatted_time = 
time.strftime('%Y%m%d%H%M%S', time.localtime()) - with open(f"./results/gpu_status/gpu_status_{formatted_time}.json", "a", encoding="utf-8") as f: + + with open(f"{output_path}/gpu_status.json", "a", encoding="utf-8") as f: f.write(json.dumps(logs) + "\n") - print(logs) + + if first_loop: + print("Start run gpu_status.py") + first_loop = False + + if print_status: + print(logs) time.sleep(60) + pynvml.nvmlShutdown() +def main(): + output_path = sys.argv[1] + print_status = sys.argv[2] + gpu_status(output_path, print_status) if __name__ == "__main__": main() diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py new file mode 100644 index 00000000..12dae8c5 --- /dev/null +++ b/prepare_yaml_file.py @@ -0,0 +1,52 @@ +import os +import sys +import time +import yaml +import json +import pynvml +import time +import psutil + +def main(): + run_type = sys.argv[1] + model = sys.argv[2] + max_steps = sys.argv[3] + run_name = sys.argv[4] + output_dir = sys.argv[5] + + if run_type == "lora_sft": + yaml_file = './results/lora_sft_template.yml' + elif run_type == "inference": + yaml_file = './results/predict_template.yml' + + if model == "9g-8B": + model_name_or_path = "../../models/sft_8b_v2" + template = "" + elif model == "Baichuan2-7B": + model_name_or_path = "../../models/Baichuan-7B" + template = "baichuan" + elif model == "ChatGLM2-6B": + model_name_or_path = "../../models/chatglm2-6b" + template = "chatglm2" + elif model == "Llama2-7B": + model_name_or_path = "../../models/llama-2-7b-ms" + template = "llama2" + elif model == "Qwen-7B": + model_name_or_path = "../../models/Qwen-7B" + template = "qwen" + + config = None + with open(yaml_file, 'r', encoding='utf-8') as f: + config = yaml.load(f.read(), Loader=yaml.FullLoader) + + config['model_name_or_path'] = model_name_or_path + config['template'] = template + config['output_dir'] = output_dir + if run_type == "lora_sft": + config['max_steps'] = max_steps + + with open(f'{output_dir}/{run_name}.yml', 'w', encoding='utf-8') as f: + yaml.dump(data=config, stream=f, allow_unicode=True) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/run_once.sh b/run_once.sh new file mode 100644 index 00000000..0d553c38 --- /dev/null +++ b/run_once.sh @@ -0,0 +1,53 @@ +run_type = $1 +model = $2 +gpu_cnt = $3 +max_steps = $4 + +current_datetime=$(date +%Y%m%d%H%M%S) + +if [ "${run_type}" = "lora_sft" ]; then + run_name="${run_type}_${model}_${gpu_cnt}_gpu_${max_steps}_step_${current_datetime}" + +else + run_name="${run_type}_${model}_${gpu_cnt}_gpu_${current_datetime}" +fi + +output_dir ="./results/${run_name}" + +if [ ! -d "$output_dir" ]; then + mkdir -p "$output_dir" + echo "路径不存在,已创建: $output_dir" +else + echo "路径已存在: $output_dir" +fi + +echo "${run_type} ${model} ${gpu_cnt} ${max_steps} ${run_name} ${output_dir}" +python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${output_dir} + + + +# export USE_MODELSCOPE_HUB=1 + +# # 0 means not printing gpu status +# python gpu_status.py ${output_dir} 0 & +# gpu_status_pid=$! +# echo "Start recording gpu status " + + +# if [ "${gpu_cnt}" = "1" ]; then +# ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml \ +# | tee ${output_dir}/log.txt" & +# train_pid=$! +# echo "Start train" +# else +# FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yml \ +# | tee ${output_dir}/log.txt" & +# train_pid=$! 
+# echo "Start train" +# fi + +# wait $train_pid +# echo "Train ended" +# sleep 90 +# kill $gpu_status_pid +# echo "Gpu status ended" \ No newline at end of file From ae308991fbb10b65958228c4785fef30de7f2e63 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 12:54:15 +0800 Subject: [PATCH 03/25] fix: fix first line --- run_once.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/run_once.sh b/run_once.sh index 0d553c38..de3dd6e7 100644 --- a/run_once.sh +++ b/run_once.sh @@ -1,3 +1,5 @@ +#!/bin/bash + run_type = $1 model = $2 gpu_cnt = $3 From 0cf37e5ec18bfd5df3fcbfee3f97d76a3345f5a2 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 12:57:41 +0800 Subject: [PATCH 04/25] fix: fix para --- run_once.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/run_once.sh b/run_once.sh index de3dd6e7..09786827 100644 --- a/run_once.sh +++ b/run_once.sh @@ -1,9 +1,9 @@ #!/bin/bash -run_type = $1 -model = $2 -gpu_cnt = $3 -max_steps = $4 +run_type = "$1" +model = "$2" +gpu_cnt = "$3" +max_steps = "$4" current_datetime=$(date +%Y%m%d%H%M%S) From f23e9d417ee1cfcb9c2383c3e14cae7fa39cb5df Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 12:59:02 +0800 Subject: [PATCH 05/25] fix: fix space --- run_once.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/run_once.sh b/run_once.sh index 09786827..7badc3fe 100644 --- a/run_once.sh +++ b/run_once.sh @@ -1,13 +1,13 @@ #!/bin/bash -run_type = "$1" -model = "$2" -gpu_cnt = "$3" -max_steps = "$4" +run_type="$1" +model="$2" +gpu_cnt="$3" +max_steps="$4" current_datetime=$(date +%Y%m%d%H%M%S) -if [ "${run_type}" = "lora_sft" ]; then +if [ "${run_type}"="lora_sft" ]; then run_name="${run_type}_${model}_${gpu_cnt}_gpu_${max_steps}_step_${current_datetime}" else @@ -36,7 +36,7 @@ python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${outp # echo "Start recording gpu status " -# if [ "${gpu_cnt}" = "1" ]; then +# if [ "${gpu_cnt}"="1" ]; then # ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml \ # | tee ${output_dir}/log.txt" & # train_pid=$! From 846fb7bfef830bf5e4477e826b06ae82e11870f8 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 12:59:52 +0800 Subject: [PATCH 06/25] fix: fix space --- run_once.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_once.sh b/run_once.sh index 7badc3fe..d084847a 100644 --- a/run_once.sh +++ b/run_once.sh @@ -14,7 +14,7 @@ else run_name="${run_type}_${model}_${gpu_cnt}_gpu_${current_datetime}" fi -output_dir ="./results/${run_name}" +output_dir="./results/${run_name}" if [ ! 
-d "$output_dir" ]; then mkdir -p "$output_dir" From 4058fd7d6487266da73c2e55fdf67b3964ac0052 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 13:01:49 +0800 Subject: [PATCH 07/25] fix: remove no use import --- prepare_yaml_file.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 12dae8c5..2be0451d 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -1,11 +1,5 @@ -import os import sys -import time import yaml -import json -import pynvml -import time -import psutil def main(): run_type = sys.argv[1] From cc99691cf4fbd5f16ac63ace88875207dbcc9a98 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 13:03:22 +0800 Subject: [PATCH 08/25] fix: fix file type --- prepare_yaml_file.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 2be0451d..10931229 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -9,9 +9,9 @@ def main(): output_dir = sys.argv[5] if run_type == "lora_sft": - yaml_file = './results/lora_sft_template.yml' + yaml_file = './results/lora_sft_template.yaml' elif run_type == "inference": - yaml_file = './results/predict_template.yml' + yaml_file = './results/predict_template.yaml' if model == "9g-8B": model_name_or_path = "../../models/sft_8b_v2" From ceb01459feb5cbe53b4c4465ce29c21b1742d760 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 13:04:54 +0800 Subject: [PATCH 09/25] fix: fix bug --- prepare_yaml_file.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 10931229..4c864328 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -12,7 +12,9 @@ def main(): yaml_file = './results/lora_sft_template.yaml' elif run_type == "inference": yaml_file = './results/predict_template.yaml' - + + model_name_or_path = "" + template = "" if model == "9g-8B": model_name_or_path = "../../models/sft_8b_v2" template = "" From 95b4b493e6a6b30dbd14b608ad16c81b6490bf94 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 13:09:43 +0800 Subject: [PATCH 10/25] chore: add echo --- prepare_yaml_file.py | 2 +- run_once.sh | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 4c864328..9ea58734 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -41,7 +41,7 @@ def main(): if run_type == "lora_sft": config['max_steps'] = max_steps - with open(f'{output_dir}/{run_name}.yml', 'w', encoding='utf-8') as f: + with open(f'{output_dir}/{run_name}.yaml', 'w', encoding='utf-8') as f: yaml.dump(data=config, stream=f, allow_unicode=True) if __name__ == "__main__": diff --git a/run_once.sh b/run_once.sh index d084847a..fdef74a1 100644 --- a/run_once.sh +++ b/run_once.sh @@ -18,14 +18,15 @@ output_dir="./results/${run_name}" if [ ! 
-d "$output_dir" ]; then mkdir -p "$output_dir" - echo "路径不存在,已创建: $output_dir" + echo "output_dir created: $output_dir" else - echo "路径已存在: $output_dir" + echo "output_dir exists: $output_dir" fi -echo "${run_type} ${model} ${gpu_cnt} ${max_steps} ${run_name} ${output_dir}" -python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${output_dir} +# echo "${run_type} ${model} ${gpu_cnt} ${max_steps} ${run_name} ${output_dir}" +python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${output_dir} +echo "yaml file save to {output_dir}/{run_name}.yaml" # export USE_MODELSCOPE_HUB=1 From 8162a54aa584155c237ba7833d11898ea32f6e56 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 13:10:34 +0800 Subject: [PATCH 11/25] fix:small fix --- run_once.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run_once.sh b/run_once.sh index fdef74a1..0df70c36 100644 --- a/run_once.sh +++ b/run_once.sh @@ -26,7 +26,7 @@ fi # echo "${run_type} ${model} ${gpu_cnt} ${max_steps} ${run_name} ${output_dir}" python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${output_dir} -echo "yaml file save to {output_dir}/{run_name}.yaml" +echo "yaml file save to ${output_dir}/${run_name}.yaml" # export USE_MODELSCOPE_HUB=1 From 64044380bd45ecca457284dc5ddcbcf08c635a5d Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 13:21:38 +0800 Subject: [PATCH 12/25] fix: add not supported model err msg --- batch_run.sh | 2 +- prepare_yaml_file.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/batch_run.sh b/batch_run.sh index 1f3329e1..fd9d5f59 100644 --- a/batch_run.sh +++ b/batch_run.sh @@ -1 +1 @@ -bash run_once.sh lora_sft Baichuan-7B 4 50 +bash run_once.sh lora_sft Qwen-7B 4 50 diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 9ea58734..69cb300e 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -19,8 +19,8 @@ def main(): model_name_or_path = "../../models/sft_8b_v2" template = "" elif model == "Baichuan2-7B": - model_name_or_path = "../../models/Baichuan-7B" - template = "baichuan" + model_name_or_path = "../../models/Baichuan2-7B" + template = "baichuan2" elif model == "ChatGLM2-6B": model_name_or_path = "../../models/chatglm2-6b" template = "chatglm2" @@ -30,11 +30,15 @@ def main(): elif model == "Qwen-7B": model_name_or_path = "../../models/Qwen-7B" template = "qwen" + else: + print("ERROR: model not supported.") + sys.exit() config = None with open(yaml_file, 'r', encoding='utf-8') as f: config = yaml.load(f.read(), Loader=yaml.FullLoader) - + + config['model_name_or_path'] = model_name_or_path config['template'] = template config['output_dir'] = output_dir From 190fddf27d94e892b1843cdd241493ec30674c1d Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 13:37:17 +0800 Subject: [PATCH 13/25] fix: small change --- gpu_status.py | 7 ++++--- prepare_yaml_file.py | 2 ++ run_once.sh | 19 ++++++++++--------- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/gpu_status.py b/gpu_status.py index f87d2e9c..8b1a0e5c 100644 --- a/gpu_status.py +++ b/gpu_status.py @@ -6,7 +6,7 @@ import psutil UNIT = 1024 * 1024 * 1024 -def gpu_status(output_path = "./results/gpu_status", print_status = False): +def gpu_status(output_path = "./results/gpu_status", print_status = False, sleep_time = 60): pynvml.nvmlInit() gpuDeviceCount = pynvml.nvmlDeviceGetCount() start_time = time.time() @@ -55,14 +55,15 @@ def gpu_status(output_path = "./results/gpu_status", print_status = False): if print_status: 
From 190fddf27d94e892b1843cdd241493ec30674c1d Mon Sep 17 00:00:00 2001
From: wql
Date: Thu, 5 Sep 2024 13:37:17 +0800
Subject: [PATCH 13/25] fix: small change

---
 gpu_status.py        |  7 ++++---
 prepare_yaml_file.py |  2 ++
 run_once.sh          | 19 ++++++++++---------
 3 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/gpu_status.py b/gpu_status.py
index f87d2e9c..8b1a0e5c 100644
--- a/gpu_status.py
+++ b/gpu_status.py
 UNIT = 1024 * 1024 * 1024
 
-def gpu_status(output_path = "./results/gpu_status", print_status = False):
+def gpu_status(output_path = "./results/gpu_status", print_status = False, sleep_time = 60):
     pynvml.nvmlInit()
     gpuDeviceCount = pynvml.nvmlDeviceGetCount()
     start_time = time.time()
 
         if print_status:
             print(logs)
 
-        time.sleep(60)
+        time.sleep(sleep_time)
 
     pynvml.nvmlShutdown()
 
 def main():
     output_path = sys.argv[1]
     print_status = sys.argv[2]
-    gpu_status(output_path, print_status)
+    sleep_time = int(sys.argv[3])  # argv values are strings; time.sleep() needs a number
+    gpu_status(output_path, print_status, sleep_time)
 
 if __name__ == "__main__":
     main()
diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py
index 69cb300e..f5b8abc0 100644
--- a/prepare_yaml_file.py
+++ b/prepare_yaml_file.py
     with open(f'{output_dir}/{run_name}.yaml', 'w', encoding='utf-8') as f:
         yaml.dump(data=config, stream=f, allow_unicode=True)
+
+    print(f"yaml file saved to {output_dir}/{run_name}.yaml")
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/run_once.sh b/run_once.sh
index 0df70c36..bd8b5d87 100644
--- a/run_once.sh
+++ b/run_once.sh
 # echo "${run_type} ${model} ${gpu_cnt} ${max_steps} ${run_name} ${output_dir}"
 python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${output_dir}
-echo "yaml file save to ${output_dir}/${run_name}.yaml"
 
+export USE_MODELSCOPE_HUB=1
 
-# export USE_MODELSCOPE_HUB=1
 
-# # 0 means not printing gpu status
-# python gpu_status.py ${output_dir} 0 &
-# gpu_status_pid=$!
-# echo "Start recording gpu status "
+echo "Start recording gpu status "
 
+# 0 means not printing gpu status
+python gpu_status.py ${output_dir} 1 10 &
+gpu_status_pid=$!
+echo "${gpu_status_pid}"
 
+sleep 60
 
 # if [ "${gpu_cnt}"="1" ]; then
 # ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml \
 # | tee ${output_dir}/log.txt" &
 # train_pid=$!
 # echo "Start train"
 # else
 # FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yml \
 # | tee ${output_dir}/log.txt" &
 # train_pid=$!
 # echo "Start train"
 # fi
 
 # wait $train_pid
 # echo "Train ended"
 # sleep 90
-# kill $gpu_status_pid
-# echo "Gpu status ended"
\ No newline at end of file
+
+kill $gpu_status_pid
+echo "Gpu status ended"
\ No newline at end of file

From 36840f031033aac24176662594dac7488627614d Mon Sep 17 00:00:00 2001
From: wql
Date: Thu, 5 Sep 2024 14:35:47 +0800
Subject: [PATCH 14/25] fix: fix baichuan2 path

---
 prepare_yaml_file.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py
index f5b8abc0..36357f6b 100644
--- a/prepare_yaml_file.py
+++ b/prepare_yaml_file.py
     elif model == "Baichuan2-7B":
-        model_name_or_path = "../../models/Baichuan2-7B"
+        model_name_or_path = "../../models/Baichuan2-7B-Base"
         template = "baichuan2"
     elif model == "ChatGLM2-6B":
         model_name_or_path = "../../models/chatglm2-6b"
         template = "chatglm2"
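The sys.argv handling in gpu_status.py (touched in "fix: small change" above) is fragile: argv values are strings, so passing "0" for print_status is still truthy, and time.sleep() rejects a string sleep_time. A minimal sketch of stricter parsing with argparse, assuming the same three positional options; hypothetical, not part of this series:

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="poll GPU status via pynvml")
    parser.add_argument("output_path", help="directory that receives gpu_status.json")
    # type=int makes "0" really mean off after the bool() conversion below
    parser.add_argument("print_status", type=int, choices=[0, 1],
                        help="1 prints each status line, 0 stays quiet")
    parser.add_argument("sleep_time", type=int, nargs="?", default=60,
                        help="seconds between polls")
    return parser.parse_args()

if __name__ == "__main__":
    args = parse_args()
    # gpu_status(args.output_path, bool(args.print_status), args.sleep_time)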
+# echo "${gpu_status_pid}" -sleep 60 +if [ "${gpu_cnt}"="1" ]; then + ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml \ + | tee ${output_dir}/log.txt" & + train_pid=$! + echo "Start train" +else + FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yml \ + | tee ${output_dir}/log.txt" & + train_pid=$! + echo "Start train" +fi -# if [ "${gpu_cnt}"="1" ]; then -# ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml \ -# | tee ${output_dir}/log.txt" & -# train_pid=$! -# echo "Start train" -# else -# FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yml \ -# | tee ${output_dir}/log.txt" & -# train_pid=$! -# echo "Start train" -# fi +wait $train_pid +echo "Train ended" -# wait $train_pid -# echo "Train ended" # sleep 90 - -kill $gpu_status_pid -echo "Gpu status ended" \ No newline at end of file +# kill $gpu_status_pid +# echo "Gpu status ended" \ No newline at end of file From f71f62f2f6bc6ec29bedb7aded36d9991d5de60c Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 14:47:17 +0800 Subject: [PATCH 16/25] fix: fix typo --- run_once.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/run_once.sh b/run_once.sh index 2c93c026..cf13de0b 100644 --- a/run_once.sh +++ b/run_once.sh @@ -35,13 +35,11 @@ export USE_MODELSCOPE_HUB=1 # echo "${gpu_status_pid}" if [ "${gpu_cnt}"="1" ]; then - ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml \ - | tee ${output_dir}/log.txt" & + ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml | tee "${output_dir}/log.txt" & train_pid=$! echo "Start train" else - FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yml \ - | tee ${output_dir}/log.txt" & + FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yml | tee "${output_dir}/log.txt" & train_pid=$! 
echo "Start train" fi From ab4bf8bd4dc6b6bf5c10f4d32a61cb9b9dc935ac Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 06:52:33 +0000 Subject: [PATCH 17/25] add: add all test results --- ...ichuan-7B_4_gpu_50_step_20240905050501.yml | 31 +++++++++++++++++++ ...chuan-7B_4_gpu_50_step_20240905050958.yaml | 31 +++++++++++++++++++ ...chuan-7B_4_gpu_50_step_20240905051039.yaml | 31 +++++++++++++++++++ ..._Qwen-7B_4_gpu_50_step_20240905052241.yaml | 31 +++++++++++++++++++ ..._Qwen-7B_4_gpu_50_step_20240905053758.yaml | 31 +++++++++++++++++++ ...g.txt &\n train_pid=1720\n echo Start" | 0 ..._Qwen-7B_4_gpu_50_step_20240905064243.yaml | 31 +++++++++++++++++++ .../log.txt | 0 ..._Qwen-7B_4_gpu_50_step_20240905064736.yaml | 31 +++++++++++++++++++ 9 files changed, 217 insertions(+) create mode 100644 results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501.yml create mode 100644 results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958.yaml create mode 100644 results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039.yaml create mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241.yaml create mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758.yaml create mode 100644 "results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/log.txt &\n train_pid=1720\n echo Start" create mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243.yaml create mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/log.txt create mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736.yaml diff --git a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501.yml b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501.yml new file mode 100644 index 00000000..26507813 --- /dev/null +++ b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501.yml @@ -0,0 +1,31 @@ +bf16: true +cutoff_len: 1024 +dataset: belle_1m +ddp_timeout: 180000000 +do_train: true +eval_steps: 500 +eval_strategy: steps +finetuning_type: lora +gradient_accumulation_steps: 8 +include_num_input_tokens_seen: true +include_tokens_per_second: true +learning_rate: 0.0001 +logging_steps: 3 +lora_target: all +lr_scheduler_type: cosine +max_samples: 10000 +max_steps: '50' +model_name_or_path: '' +num_train_epochs: 10.0 +output_dir: ./results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501 +overwrite_cache: true +overwrite_output_dir: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +plot_loss: true +preprocessing_num_workers: 16 +save_steps: 500 +stage: sft +template: '' +val_size: 0.1 +warmup_ratio: 0.1 diff --git a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958.yaml b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958.yaml new file mode 100644 index 00000000..e041b60b --- /dev/null +++ b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958.yaml @@ -0,0 +1,31 @@ +bf16: true +cutoff_len: 1024 +dataset: belle_1m +ddp_timeout: 180000000 
+do_train: true +eval_steps: 500 +eval_strategy: steps +finetuning_type: lora +gradient_accumulation_steps: 8 +include_num_input_tokens_seen: true +include_tokens_per_second: true +learning_rate: 0.0001 +logging_steps: 3 +lora_target: all +lr_scheduler_type: cosine +max_samples: 10000 +max_steps: '50' +model_name_or_path: '' +num_train_epochs: 10.0 +output_dir: ./results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958 +overwrite_cache: true +overwrite_output_dir: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +plot_loss: true +preprocessing_num_workers: 16 +save_steps: 500 +stage: sft +template: '' +val_size: 0.1 +warmup_ratio: 0.1 diff --git a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039.yaml b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039.yaml new file mode 100644 index 00000000..84e13b18 --- /dev/null +++ b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039.yaml @@ -0,0 +1,31 @@ +bf16: true +cutoff_len: 1024 +dataset: belle_1m +ddp_timeout: 180000000 +do_train: true +eval_steps: 500 +eval_strategy: steps +finetuning_type: lora +gradient_accumulation_steps: 8 +include_num_input_tokens_seen: true +include_tokens_per_second: true +learning_rate: 0.0001 +logging_steps: 3 +lora_target: all +lr_scheduler_type: cosine +max_samples: 10000 +max_steps: '50' +model_name_or_path: '' +num_train_epochs: 10.0 +output_dir: ./results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039 +overwrite_cache: true +overwrite_output_dir: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +plot_loss: true +preprocessing_num_workers: 16 +save_steps: 500 +stage: sft +template: '' +val_size: 0.1 +warmup_ratio: 0.1 diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241.yaml new file mode 100644 index 00000000..2a1de0fe --- /dev/null +++ b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241.yaml @@ -0,0 +1,31 @@ +bf16: true +cutoff_len: 1024 +dataset: belle_1m +ddp_timeout: 180000000 +do_train: true +eval_steps: 500 +eval_strategy: steps +finetuning_type: lora +gradient_accumulation_steps: 8 +include_num_input_tokens_seen: true +include_tokens_per_second: true +learning_rate: 0.0001 +logging_steps: 3 +lora_target: all +lr_scheduler_type: cosine +max_samples: 10000 +max_steps: '50' +model_name_or_path: ../../models/Qwen-7B +num_train_epochs: 10.0 +output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241 +overwrite_cache: true +overwrite_output_dir: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +plot_loss: true +preprocessing_num_workers: 16 +save_steps: 500 +stage: sft +template: qwen +val_size: 0.1 +warmup_ratio: 0.1 diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758.yaml new file mode 100644 index 00000000..caa1505f --- /dev/null +++ b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758.yaml @@ -0,0 +1,31 @@ +bf16: true +cutoff_len: 1024 +dataset: belle_1m +ddp_timeout: 180000000 +do_train: true +eval_steps: 500 +eval_strategy: steps 
+finetuning_type: lora +gradient_accumulation_steps: 8 +include_num_input_tokens_seen: true +include_tokens_per_second: true +learning_rate: 0.0001 +logging_steps: 3 +lora_target: all +lr_scheduler_type: cosine +max_samples: 10000 +max_steps: '50' +model_name_or_path: ../../models/Qwen-7B +num_train_epochs: 10.0 +output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758 +overwrite_cache: true +overwrite_output_dir: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +plot_loss: true +preprocessing_num_workers: 16 +save_steps: 500 +stage: sft +template: qwen +val_size: 0.1 +warmup_ratio: 0.1 diff --git "a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/log.txt &\n train_pid=1720\n echo Start" "b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/log.txt &\n train_pid=1720\n echo Start" new file mode 100644 index 00000000..e69de29b diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243.yaml new file mode 100644 index 00000000..4631b614 --- /dev/null +++ b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243.yaml @@ -0,0 +1,31 @@ +bf16: true +cutoff_len: 1024 +dataset: belle_1m +ddp_timeout: 180000000 +do_train: true +eval_steps: 500 +eval_strategy: steps +finetuning_type: lora +gradient_accumulation_steps: 8 +include_num_input_tokens_seen: true +include_tokens_per_second: true +learning_rate: 0.0001 +logging_steps: 3 +lora_target: all +lr_scheduler_type: cosine +max_samples: 10000 +max_steps: '50' +model_name_or_path: ../../models/Qwen-7B +num_train_epochs: 10.0 +output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243 +overwrite_cache: true +overwrite_output_dir: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +plot_loss: true +preprocessing_num_workers: 16 +save_steps: 500 +stage: sft +template: qwen +val_size: 0.1 +warmup_ratio: 0.1 diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/log.txt b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/log.txt new file mode 100644 index 00000000..e69de29b diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736.yaml new file mode 100644 index 00000000..fe61b1e4 --- /dev/null +++ b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736.yaml @@ -0,0 +1,31 @@ +bf16: true +cutoff_len: 1024 +dataset: belle_1m +ddp_timeout: 180000000 +do_train: true +eval_steps: 500 +eval_strategy: steps +finetuning_type: lora +gradient_accumulation_steps: 8 +include_num_input_tokens_seen: true +include_tokens_per_second: true +learning_rate: 0.0001 +logging_steps: 3 +lora_target: all +lr_scheduler_type: cosine +max_samples: 10000 +max_steps: '50' +model_name_or_path: ../../models/Qwen-7B +num_train_epochs: 10.0 +output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736 +overwrite_cache: true +overwrite_output_dir: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +plot_loss: true +preprocessing_num_workers: 16 +save_steps: 500 +stage: sft +template: qwen +val_size: 0.1 +warmup_ratio: 0.1 From c6a4d43c068cc6e1f680e7c2f08603800b598039 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 07:05:47 +0000 Subject: [PATCH 18/25] fix: remove no need 
test file --- ...ichuan-7B_4_gpu_50_step_20240905050501.yml | 31 ------------------- ...chuan-7B_4_gpu_50_step_20240905050958.yaml | 31 ------------------- ...chuan-7B_4_gpu_50_step_20240905051039.yaml | 31 ------------------- ..._Qwen-7B_4_gpu_50_step_20240905052241.yaml | 31 ------------------- ..._Qwen-7B_4_gpu_50_step_20240905053758.yaml | 31 ------------------- ...g.txt &\n train_pid=1720\n echo Start" | 0 ..._Qwen-7B_4_gpu_50_step_20240905064243.yaml | 31 ------------------- .../log.txt | 0 ..._Qwen-7B_4_gpu_50_step_20240905064736.yaml | 31 ------------------- 9 files changed, 217 deletions(-) delete mode 100644 results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501.yml delete mode 100644 results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958.yaml delete mode 100644 results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039.yaml delete mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241.yaml delete mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758.yaml delete mode 100644 "results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/log.txt &\n train_pid=1720\n echo Start" delete mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243.yaml delete mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/log.txt delete mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736.yaml diff --git a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501.yml b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501.yml deleted file mode 100644 index 26507813..00000000 --- a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501.yml +++ /dev/null @@ -1,31 +0,0 @@ -bf16: true -cutoff_len: 1024 -dataset: belle_1m -ddp_timeout: 180000000 -do_train: true -eval_steps: 500 -eval_strategy: steps -finetuning_type: lora -gradient_accumulation_steps: 8 -include_num_input_tokens_seen: true -include_tokens_per_second: true -learning_rate: 0.0001 -logging_steps: 3 -lora_target: all -lr_scheduler_type: cosine -max_samples: 10000 -max_steps: '50' -model_name_or_path: '' -num_train_epochs: 10.0 -output_dir: ./results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050501 -overwrite_cache: true -overwrite_output_dir: true -per_device_eval_batch_size: 2 -per_device_train_batch_size: 2 -plot_loss: true -preprocessing_num_workers: 16 -save_steps: 500 -stage: sft -template: '' -val_size: 0.1 -warmup_ratio: 0.1 diff --git a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958.yaml b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958.yaml deleted file mode 100644 index e041b60b..00000000 --- a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958.yaml +++ /dev/null @@ -1,31 +0,0 @@ -bf16: true -cutoff_len: 1024 -dataset: belle_1m -ddp_timeout: 180000000 -do_train: true -eval_steps: 500 -eval_strategy: steps -finetuning_type: lora -gradient_accumulation_steps: 8 -include_num_input_tokens_seen: true 
-include_tokens_per_second: true -learning_rate: 0.0001 -logging_steps: 3 -lora_target: all -lr_scheduler_type: cosine -max_samples: 10000 -max_steps: '50' -model_name_or_path: '' -num_train_epochs: 10.0 -output_dir: ./results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905050958 -overwrite_cache: true -overwrite_output_dir: true -per_device_eval_batch_size: 2 -per_device_train_batch_size: 2 -plot_loss: true -preprocessing_num_workers: 16 -save_steps: 500 -stage: sft -template: '' -val_size: 0.1 -warmup_ratio: 0.1 diff --git a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039.yaml b/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039.yaml deleted file mode 100644 index 84e13b18..00000000 --- a/results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039.yaml +++ /dev/null @@ -1,31 +0,0 @@ -bf16: true -cutoff_len: 1024 -dataset: belle_1m -ddp_timeout: 180000000 -do_train: true -eval_steps: 500 -eval_strategy: steps -finetuning_type: lora -gradient_accumulation_steps: 8 -include_num_input_tokens_seen: true -include_tokens_per_second: true -learning_rate: 0.0001 -logging_steps: 3 -lora_target: all -lr_scheduler_type: cosine -max_samples: 10000 -max_steps: '50' -model_name_or_path: '' -num_train_epochs: 10.0 -output_dir: ./results/lora_sft_Baichuan-7B_4_gpu_50_step_20240905051039 -overwrite_cache: true -overwrite_output_dir: true -per_device_eval_batch_size: 2 -per_device_train_batch_size: 2 -plot_loss: true -preprocessing_num_workers: 16 -save_steps: 500 -stage: sft -template: '' -val_size: 0.1 -warmup_ratio: 0.1 diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241.yaml deleted file mode 100644 index 2a1de0fe..00000000 --- a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241.yaml +++ /dev/null @@ -1,31 +0,0 @@ -bf16: true -cutoff_len: 1024 -dataset: belle_1m -ddp_timeout: 180000000 -do_train: true -eval_steps: 500 -eval_strategy: steps -finetuning_type: lora -gradient_accumulation_steps: 8 -include_num_input_tokens_seen: true -include_tokens_per_second: true -learning_rate: 0.0001 -logging_steps: 3 -lora_target: all -lr_scheduler_type: cosine -max_samples: 10000 -max_steps: '50' -model_name_or_path: ../../models/Qwen-7B -num_train_epochs: 10.0 -output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905052241 -overwrite_cache: true -overwrite_output_dir: true -per_device_eval_batch_size: 2 -per_device_train_batch_size: 2 -plot_loss: true -preprocessing_num_workers: 16 -save_steps: 500 -stage: sft -template: qwen -val_size: 0.1 -warmup_ratio: 0.1 diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758.yaml deleted file mode 100644 index caa1505f..00000000 --- a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758.yaml +++ /dev/null @@ -1,31 +0,0 @@ -bf16: true -cutoff_len: 1024 -dataset: belle_1m -ddp_timeout: 180000000 -do_train: true -eval_steps: 500 -eval_strategy: steps -finetuning_type: lora -gradient_accumulation_steps: 8 -include_num_input_tokens_seen: true -include_tokens_per_second: true -learning_rate: 0.0001 
-logging_steps: 3 -lora_target: all -lr_scheduler_type: cosine -max_samples: 10000 -max_steps: '50' -model_name_or_path: ../../models/Qwen-7B -num_train_epochs: 10.0 -output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905053758 -overwrite_cache: true -overwrite_output_dir: true -per_device_eval_batch_size: 2 -per_device_train_batch_size: 2 -plot_loss: true -preprocessing_num_workers: 16 -save_steps: 500 -stage: sft -template: qwen -val_size: 0.1 -warmup_ratio: 0.1 diff --git "a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/log.txt &\n train_pid=1720\n echo Start" "b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/log.txt &\n train_pid=1720\n echo Start" deleted file mode 100644 index e69de29b..00000000 diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243.yaml deleted file mode 100644 index 4631b614..00000000 --- a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243.yaml +++ /dev/null @@ -1,31 +0,0 @@ -bf16: true -cutoff_len: 1024 -dataset: belle_1m -ddp_timeout: 180000000 -do_train: true -eval_steps: 500 -eval_strategy: steps -finetuning_type: lora -gradient_accumulation_steps: 8 -include_num_input_tokens_seen: true -include_tokens_per_second: true -learning_rate: 0.0001 -logging_steps: 3 -lora_target: all -lr_scheduler_type: cosine -max_samples: 10000 -max_steps: '50' -model_name_or_path: ../../models/Qwen-7B -num_train_epochs: 10.0 -output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064243 -overwrite_cache: true -overwrite_output_dir: true -per_device_eval_batch_size: 2 -per_device_train_batch_size: 2 -plot_loss: true -preprocessing_num_workers: 16 -save_steps: 500 -stage: sft -template: qwen -val_size: 0.1 -warmup_ratio: 0.1 diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/log.txt b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/log.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736.yaml deleted file mode 100644 index fe61b1e4..00000000 --- a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736.yaml +++ /dev/null @@ -1,31 +0,0 @@ -bf16: true -cutoff_len: 1024 -dataset: belle_1m -ddp_timeout: 180000000 -do_train: true -eval_steps: 500 -eval_strategy: steps -finetuning_type: lora -gradient_accumulation_steps: 8 -include_num_input_tokens_seen: true -include_tokens_per_second: true -learning_rate: 0.0001 -logging_steps: 3 -lora_target: all -lr_scheduler_type: cosine -max_samples: 10000 -max_steps: '50' -model_name_or_path: ../../models/Qwen-7B -num_train_epochs: 10.0 -output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905064736 -overwrite_cache: true -overwrite_output_dir: true -per_device_eval_batch_size: 2 -per_device_train_batch_size: 2 -plot_loss: true -preprocessing_num_workers: 16 -save_steps: 500 -stage: sft -template: qwen -val_size: 0.1 -warmup_ratio: 0.1 From 62a486dfc0151807016832b463ae45adb7fea163 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 07:07:49 +0000 Subject: [PATCH 19/25] add: add test file --- .../log.txt | 0 ..._Qwen-7B_4_gpu_50_step_20240905070656.yaml | 31 +++++++++++++++++++ 2 files changed, 31 insertions(+) create 
mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656/log.txt create mode 100644 results/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656.yaml diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656/log.txt b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656/log.txt new file mode 100644 index 00000000..e69de29b diff --git a/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656.yaml b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656.yaml new file mode 100644 index 00000000..410ed726 --- /dev/null +++ b/results/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656.yaml @@ -0,0 +1,31 @@ +bf16: true +cutoff_len: 1024 +dataset: belle_1m +ddp_timeout: 180000000 +do_train: true +eval_steps: 500 +eval_strategy: steps +finetuning_type: lora +gradient_accumulation_steps: 8 +include_num_input_tokens_seen: true +include_tokens_per_second: true +learning_rate: 0.0001 +logging_steps: 3 +lora_target: all +lr_scheduler_type: cosine +max_samples: 10000 +max_steps: '50' +model_name_or_path: ../../models/Qwen-7B +num_train_epochs: 10.0 +output_dir: ./results/lora_sft_Qwen-7B_4_gpu_50_step_20240905070656 +overwrite_cache: true +overwrite_output_dir: true +per_device_eval_batch_size: 2 +per_device_train_batch_size: 2 +plot_loss: true +preprocessing_num_workers: 16 +save_steps: 500 +stage: sft +template: qwen +val_size: 0.1 +warmup_ratio: 0.1 From 5baa46a798168961d6c2fc68f49ad9309b606133 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 15:15:37 +0800 Subject: [PATCH 20/25] fix: test fix --- prepare_yaml_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 36357f6b..e29993c6 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -28,7 +28,7 @@ def main(): model_name_or_path = "../../models/llama-2-7b-ms" template = "llama2" elif model == "Qwen-7B": - model_name_or_path = "../../models/Qwen-7B" + model_name_or_path = "'../../models/Qwen-7B'" template = "qwen" else: print("ERROR: model not supported.") From 3d018c82487239b309d0c44e6757d47079ddf00e Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 15:17:39 +0800 Subject: [PATCH 21/25] fix: fix typo --- run_once.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/run_once.sh b/run_once.sh index cf13de0b..ce85dc49 100644 --- a/run_once.sh +++ b/run_once.sh @@ -35,11 +35,11 @@ export USE_MODELSCOPE_HUB=1 # echo "${gpu_status_pid}" if [ "${gpu_cnt}"="1" ]; then - ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml | tee "${output_dir}/log.txt" & + ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yaml | tee "${output_dir}/log.txt" & train_pid=$! echo "Start train" else - FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yml | tee "${output_dir}/log.txt" & + FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yaml | tee "${output_dir}/log.txt" & train_pid=$! 
echo "Start train" fi From 2248960fe72b89738b0f7e5d0bd29e3bda63c470 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 15:18:37 +0800 Subject: [PATCH 22/25] fix: fix format --- prepare_yaml_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index e29993c6..36357f6b 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -28,7 +28,7 @@ def main(): model_name_or_path = "../../models/llama-2-7b-ms" template = "llama2" elif model == "Qwen-7B": - model_name_or_path = "'../../models/Qwen-7B'" + model_name_or_path = "../../models/Qwen-7B" template = "qwen" else: print("ERROR: model not supported.") From e754b62ccd2b23149d5ac5942e48d443c059897a Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 15:28:27 +0800 Subject: [PATCH 23/25] fix: test --- prepare_yaml_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 36357f6b..7428f229 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -28,7 +28,7 @@ def main(): model_name_or_path = "../../models/llama-2-7b-ms" template = "llama2" elif model == "Qwen-7B": - model_name_or_path = "../../models/Qwen-7B" + model_name_or_path = "/root/models/Qwen-7B" template = "qwen" else: print("ERROR: model not supported.") From f15e37dfad640378836930d640241a630182b82e Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 15:49:32 +0800 Subject: [PATCH 24/25] fix: fix bf16 --- prepare_yaml_file.py | 2 +- results/lora_sft_template.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 7428f229..36357f6b 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -28,7 +28,7 @@ def main(): model_name_or_path = "../../models/llama-2-7b-ms" template = "llama2" elif model == "Qwen-7B": - model_name_or_path = "/root/models/Qwen-7B" + model_name_or_path = "../../models/Qwen-7B" template = "qwen" else: print("ERROR: model not supported.") diff --git a/results/lora_sft_template.yaml b/results/lora_sft_template.yaml index a3b42642..9a4411e4 100644 --- a/results/lora_sft_template.yaml +++ b/results/lora_sft_template.yaml @@ -29,7 +29,7 @@ learning_rate: 1.0e-4 num_train_epochs: 10.0 lr_scheduler_type: cosine warmup_ratio: 0.1 -bf16: true +fp16: true ddp_timeout: 180000000 max_steps: 500 include_num_input_tokens_seen: true From 113966157cc5163aee909424a2515274534c5058 Mon Sep 17 00:00:00 2001 From: wql Date: Thu, 5 Sep 2024 15:54:33 +0800 Subject: [PATCH 25/25] fix: fix max steps --- prepare_yaml_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py index 36357f6b..fb27f279 100644 --- a/prepare_yaml_file.py +++ b/prepare_yaml_file.py @@ -43,7 +43,7 @@ def main(): config['template'] = template config['output_dir'] = output_dir if run_type == "lora_sft": - config['max_steps'] = max_steps + config['max_steps'] = int(max_steps) with open(f'{output_dir}/{run_name}.yaml', 'w', encoding='utf-8') as f: yaml.dump(data=config, stream=f, allow_unicode=True)