Make the GPU status sampling interval configurable and turn on GPU status
recording in run_once.sh.

- gpu_status.py: gpu_status() takes a new sleep_time argument (default 60)
  that replaces the hard-coded time.sleep(60). main() reads it from argv[3];
  argv values are strings, so it is converted with float() (time.sleep()
  rejects str) and falls back to 60 when argv[3] is absent. print_status is
  passed on as a real boolean: "0" or "" means "do not print", because any
  non-empty string would otherwise be truthy.
- prepare_yaml_file.py: prints the path of the YAML file it wrote.
- run_once.sh: exports USE_MODELSCOPE_HUB=1, starts gpu_status.py in the
  background (printing on, 10 s interval), sleeps 60 s, and kills the
  recorder at the end of the script.

Note: the blob ids on the gpu_status.py and run_once.sh "index" lines predate
the amended "+" lines below.

diff --git a/gpu_status.py b/gpu_status.py
index f87d2e9c..8b1a0e5c 100644
--- a/gpu_status.py
+++ b/gpu_status.py
@@ -6,7 +6,7 @@ import psutil
 
 UNIT = 1024 * 1024 * 1024
 
-def gpu_status(output_path = "./results/gpu_status", print_status = False):
+def gpu_status(output_path = "./results/gpu_status", print_status = False, sleep_time = 60):
     pynvml.nvmlInit()
     gpuDeviceCount = pynvml.nvmlDeviceGetCount()
     start_time = time.time()
@@ -55,14 +55,15 @@ def gpu_status(output_path = "./results/gpu_status", print_status = False):
         if print_status:
             print(logs)
 
-        time.sleep(60)
+        time.sleep(sleep_time)
 
     pynvml.nvmlShutdown()
 
 def main():
     output_path = sys.argv[1]
     print_status = sys.argv[2]
-    gpu_status(output_path, print_status)
+    sleep_time = float(sys.argv[3]) if len(sys.argv) > 3 else 60
+    gpu_status(output_path, print_status not in ("0", ""), sleep_time)
 
 if __name__ == "__main__":
     main()
diff --git a/prepare_yaml_file.py b/prepare_yaml_file.py
index 69cb300e..f5b8abc0 100644
--- a/prepare_yaml_file.py
+++ b/prepare_yaml_file.py
@@ -47,6 +47,8 @@ def main():
 
     with open(f'{output_dir}/{run_name}.yaml', 'w', encoding='utf-8') as f:
         yaml.dump(data=config, stream=f, allow_unicode=True)
+
+    print(f"yaml file saved to {output_dir}/{run_name}.yaml")
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/run_once.sh b/run_once.sh
index 0df70c36..bd8b5d87 100644
--- a/run_once.sh
+++ b/run_once.sh
@@ -26,16 +26,16 @@ fi
 
 # echo "${run_type} ${model} ${gpu_cnt} ${max_steps} ${run_name} ${output_dir}"
 python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${output_dir}
-echo "yaml file save to ${output_dir}/${run_name}.yaml"
+export USE_MODELSCOPE_HUB=1
 
 
 
-# export USE_MODELSCOPE_HUB=1
-
-# # 0 means not printing gpu status
-# python gpu_status.py ${output_dir} 0 &
-# gpu_status_pid=$!
-# echo "Start recording gpu status "
+echo "Start recording gpu status "
+# gpu_status.py args: <output_dir> <print_status: 0 = silent, otherwise print> <sampling interval in seconds>
+python gpu_status.py "${output_dir}" 1 10 &
+gpu_status_pid=$!
+echo "${gpu_status_pid}"
+sleep 60
 
 # if [ "${gpu_cnt}"="1" ]; then
 # ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml \
@@ -52,5 +52,6 @@ echo "yaml file save to ${output_dir}/${run_name}.yaml"
 # wait $train_pid
 # echo "Train ended"
 # sleep 90
-# kill $gpu_status_pid
-# echo "Gpu status ended"
\ No newline at end of file
+
+kill "${gpu_status_pid}"
+echo "Gpu status ended"
\ No newline at end of file