Re-enable the NPU status recorder and the llamafactory-cli training launch in
run_once.sh (both were commented out). Expansions on the re-enabled lines are
quoted so values containing whitespace are passed as single arguments.

diff --git a/run_once.sh b/run_once.sh
index 493c0f60..56a78b10 100644
--- a/run_once.sh
+++ b/run_once.sh
@@ -37,27 +37,31 @@ python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${outp
 
 export USE_MODELSCOPE_HUB=1
 
-# echo "Start recording npu status "
-# bash npu_status.sh ${output_dir} 60 0 &
-# npu_status_pid=$!
-# # echo "${npu_status_pid}"
+echo "Start recording npu status "
+# Start npu_status.sh in the background and keep its PID so it can be stopped after training.
+bash npu_status.sh "${output_dir}" 60 0 &
+npu_status_pid=$!
+# echo "${npu_status_pid}"
 
-# if [ "${gpu_cnt}" = "1" ]; then
-#     ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yaml | tee "${output_dir}/log.txt" &
-#     train_pid=$!
-#     echo "Start single npu train"
-# else
-#     FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yaml | tee "${output_dir}/log.txt" &
-#     train_pid=$!
-#     echo "Start multi npu train"
-# fi
+# Launch training in the background, mirroring its stdout into ${output_dir}/log.txt via tee.
+# NOTE(review): $! after a backgrounded pipeline is the PID of its last command (tee), so the wait below tracks tee, not llamafactory-cli itself.
+if [ "${gpu_cnt}" = "1" ]; then
+    ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train "${output_dir}/${run_name}.yaml" | tee "${output_dir}/log.txt" &
+    train_pid=$!
+    echo "Start single npu train"
+else
+    FORCE_TORCHRUN=1 llamafactory-cli train "${output_dir}/${run_name}.yaml" | tee "${output_dir}/log.txt" &
+    train_pid=$!
+    echo "Start multi npu train"
+fi
 
-# wait $train_pid
-# echo "Train ended"
+wait "${train_pid}"
+echo "Train ended"
 
-# sleep 60
-# kill $npu_status_pid
-# echo "Npu status ended"
+# Keep the recorder running 60 more seconds after training, then stop it.
+sleep 60
+kill "${npu_status_pid}"
+echo "Npu status ended"
 
 python3 -c "import moxing as mox; mox.file.copy_parallel('${output_dir}', 'obs://xty/results/${run_name}')"
 rm -r ${output_dir}
\ No newline at end of file