diff --git a/batch_run.sh b/batch_run.sh
new file mode 100644
index 00000000..1f3329e1
--- /dev/null
+++ b/batch_run.sh
@@ -0,0 +1 @@
+bash run_once.sh lora_sft Baichuan-7B 4 50
diff --git a/gpu_status.py b/gpu_status.py
index 9f89f519..f87d2e9c 100644
--- a/gpu_status.py
+++ b/gpu_status.py
@@ -1,16 +1,17 @@
 import json
+import sys
 import pynvml
 import time
 import psutil
 
+UNIT = 1024 * 1024 * 1024
 
-def main():
-    UNIT = 1024 * 1024 * 1024
-
+def gpu_status(output_path="./results/gpu_status", print_status=False):
     pynvml.nvmlInit()
     gpuDeviceCount = pynvml.nvmlDeviceGetCount()
     start_time = time.time()
-
+    first_loop = True
+
     while time.time() - start_time < 3600 *24:
         # print(time.time() - start_time)
         all_gpu_status = []
@@ -43,14 +44,25 @@ def main():
             all_gpu_status = all_gpu_status,
             all_processes_status = all_processes_status
         )
-        formatted_time = time.strftime('%Y%m%d%H%M%S', time.localtime())
-        with open(f"./results/gpu_status/gpu_status_{formatted_time}.json", "a", encoding="utf-8") as f:
+
+        with open(f"{output_path}/gpu_status.json", "a", encoding="utf-8") as f:
             f.write(json.dumps(logs) + "\n")
-        print(logs)
+
+        if first_loop:
+            print("Start run gpu_status.py")
+            first_loop = False
+
+        if print_status:
+            print(logs)
 
         time.sleep(60)
+    pynvml.nvmlShutdown()
 
 
+def main():
+    output_path = sys.argv[1]
+    print_status = sys.argv[2] != "0"  # "0" disables per-interval printing
+    gpu_status(output_path, print_status)
 
 if __name__ == "__main__":
     main()
diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py
new file mode 100644
index 00000000..12dae8c5
--- /dev/null
+++ b/prepare_yaml_file.py
@@ -0,0 +1,48 @@
+import sys
+import yaml
+
+def main():
+    # CLI arguments: run type, model key, max training steps, run name, output directory
+    run_type = sys.argv[1]
+    model = sys.argv[2]
+    max_steps = sys.argv[3]
+    run_name = sys.argv[4]
+    output_dir = sys.argv[5]
+
+    # pick the template that matches the run type
+    if run_type == "lora_sft":
+        yaml_file = './results/lora_sft_template.yml'
+    elif run_type == "inference":
+        yaml_file = './results/predict_template.yml'
+
+    # map the model key to a local checkpoint path and a chat template
+    if model == "9g-8B":
+        model_name_or_path = "../../models/sft_8b_v2"
+        template = ""
+    elif model == "Baichuan-7B":
+        model_name_or_path = "../../models/Baichuan-7B"
+        template = "baichuan"
+    elif model == "ChatGLM2-6B":
+        model_name_or_path = "../../models/chatglm2-6b"
+        template = "chatglm2"
+    elif model == "Llama2-7B":
+        model_name_or_path = "../../models/llama-2-7b-ms"
+        template = "llama2"
+    elif model == "Qwen-7B":
+        model_name_or_path = "../../models/Qwen-7B"
+        template = "qwen"
+
+    with open(yaml_file, 'r', encoding='utf-8') as f:
+        config = yaml.load(f.read(), Loader=yaml.FullLoader)
+
+    config['model_name_or_path'] = model_name_or_path
+    config['template'] = template
+    config['output_dir'] = output_dir
+    if run_type == "lora_sft":
+        config['max_steps'] = int(max_steps)  # argv values are strings
+
+    with open(f'{output_dir}/{run_name}.yml', 'w', encoding='utf-8') as f:
+        yaml.dump(data=config, stream=f, allow_unicode=True)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/run_once.sh b/run_once.sh
new file mode 100644
index 00000000..0d553c38
--- /dev/null
+++ b/run_once.sh
@@ -0,0 +1,53 @@
+run_type=$1      # lora_sft or inference
+model=$2         # model key understood by prepare_yaml_file.py
+gpu_cnt=$3       # number of GPUs to use
+max_steps=$4     # training steps (only used for lora_sft)
+
+current_datetime=$(date +%Y%m%d%H%M%S)
+
+if [ "${run_type}" = "lora_sft" ]; then
+    run_name="${run_type}_${model}_${gpu_cnt}_gpu_${max_steps}_step_${current_datetime}"
+
+else
+    run_name="${run_type}_${model}_${gpu_cnt}_gpu_${current_datetime}"
+fi
+
+output_dir="./results/${run_name}"
+
+if [ ! -d "$output_dir" ]; then
-d "$output_dir" ]; then + mkdir -p "$output_dir" + echo "路径不存在,已创建: $output_dir" +else + echo "路径已存在: $output_dir" +fi + +echo "${run_type} ${model} ${gpu_cnt} ${max_steps} ${run_name} ${output_dir}" +python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${output_dir} + + + +# export USE_MODELSCOPE_HUB=1 + +# # 0 means not printing gpu status +# python gpu_status.py ${output_dir} 0 & +# gpu_status_pid=$! +# echo "Start recording gpu status " + + +# if [ "${gpu_cnt}" = "1" ]; then +# ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml \ +# | tee ${output_dir}/log.txt" & +# train_pid=$! +# echo "Start train" +# else +# FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yml \ +# | tee ${output_dir}/log.txt" & +# train_pid=$! +# echo "Start train" +# fi + +# wait $train_pid +# echo "Train ended" +# sleep 90 +# kill $gpu_status_pid +# echo "Gpu status ended" \ No newline at end of file