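fix: make the GPU status polling interval configurable (new sleep_time argument) and enable GPU status recording in run_once.sh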

2024-09-05 13:37:17 +08:00 · 2024-09-05 13:37:17 +08:00 · 190fddf27d
parent 64044380bd
commit 190fddf27d
3 changed files with 16 additions and 12 deletions
--- a/gpu_status.py
+++ b/gpu_status.py
@ -6,7 +6,7 @@ import psutil
 UNIT = 1024 * 1024 * 1024
-def gpu_status(output_path = "./results/gpu_status", print_status = False):
+def gpu_status(output_path = "./results/gpu_status", print_status = False, sleep_time = 60):
    pynvml.nvmlInit()
    gpuDeviceCount = pynvml.nvmlDeviceGetCount()
    start_time = time.time()
@ -55,14 +55,15 @@ def gpu_status(output_path = "./results/gpu_status", print_status = False):
        if print_status:
            print(logs)
-        time.sleep(60)
+        time.sleep(sleep_time)
    pynvml.nvmlShutdown()
 def main():
    output_path = sys.argv[1]
    print_status = sys.argv[2]
-    gpu_status(output_path, print_status)
+    sleep_time = sys.argv[3]
    gpu_status(output_path, print_status, sleep_time)
 if __name__ == "__main__":
    main()
--- a/prepare_yaml_file.py
+++ b/prepare_yaml_file.py
@ -47,6 +47,8 @@ def main():
    with open(f'{output_dir}/{run_name}.yaml', 'w', encoding='utf-8') as f:
        yaml.dump(data=config, stream=f, allow_unicode=True)
    print(f"yaml file saved to {output_dir}/{run_name}.yaml")
 if __name__ == "__main__":
    main()
--- a/run_once.sh
+++ b/run_once.sh
@ -26,16 +26,16 @@ fi
 # echo "${run_type} ${model} ${gpu_cnt} ${max_steps} ${run_name} ${output_dir}"
 python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${output_dir}
 echo "yaml file save to ${output_dir}/${run_name}.yaml"
 export USE_MODELSCOPE_HUB=1
-# export USE_MODELSCOPE_HUB=1
+echo "Start recording gpu status "
-
+# 0 means not printing gpu status
-# # 0 means not printing gpu status
+python gpu_status.py ${output_dir} 1 10 &
-# python gpu_status.py ${output_dir} 0 &
+gpu_status_pid=$!
-# gpu_status_pid=$!
+echo "${gpu_status_pid}"
 # echo "Start recording gpu status "
 sleep 60
 # if [ "${gpu_cnt}"="1" ]; then
 #    ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yml \
@ -52,5 +52,6 @@ echo "yaml file save to ${output_dir}/${run_name}.yaml"
 # wait $train_pid
 # echo "Train ended"
 # sleep 90
-# kill $gpu_status_pid
+
-# echo "Gpu status ended"
+kill $gpu_status_pid
 echo "Gpu status ended"