LLaMA-Factory-310P3/run_once.sh

#!/bin/bash


# bash run_once.sh lora_sft Qwen-7B 8 50
# bash run_once.sh lora_sft Qwen-7B 1 50
# bash run_once.sh lora_sft 9g-8B 8 500
# bash run_once.sh inference Qwen-7B 8 50
# Positional arguments (see the usage examples above):
#   $1  run type, e.g. lora_sft or inference
#   $2  model name
#   $3  number of devices to use
#   $4  maximum number of training steps
run_type="${1}"
model="${2}"
gpu_cnt="${3}"
max_steps="${4}"

# Echo the effective configuration so it shows up at the top of the log.
printf '%s\n' \
    "run_type: ${run_type}" \
    "model: ${model}" \
    "gpu_cnt: ${gpu_cnt}" \
    "max_steps: ${max_steps}"

# Launch timestamp in YYYYmmddHHMMSS form; it becomes part of the run name.
current_datetime="$(date '+%Y%m%d%H%M%S')"

# Pull the Ascend toolkit environment into the current shell.
source "/usr/local/Ascend/ascend-toolkit/set_env.sh"

# Build the run name. Only lora_sft runs encode the step count.
# The spaces around "=" are required: without them `[` receives a single
# non-empty word, which is always true, so every run type would take the
# first branch.
if [ "${run_type}" = "lora_sft" ]; then
    run_name="${run_type}_${model}_${gpu_cnt}_gpu_${max_steps}_step_${current_datetime}"
else
    run_name="${run_type}_${model}_${gpu_cnt}_gpu_${current_datetime}"
fi

# Local directory that collects everything produced by this run.
output_dir="./results/910b/${run_name}"

# Create the directory on first use; otherwise just report that it is there.
if [[ -d "${output_dir}" ]]; then
    echo "output_dir exists: $output_dir"
else
    mkdir -p "${output_dir}"
    echo "output_dir created: $output_dir"
fi

# echo "${run_type} ${model} ${gpu_cnt} ${max_steps} ${run_name} ${output_dir}"
# Generate the run configuration. Every argument is quoted so that an empty
# or space-containing value keeps its position instead of being dropped or
# split into several arguments.
# NOTE(review): presumably this writes ${output_dir}/${run_name}.yaml, the
# file handed to llamafactory-cli later in this script - confirm.
if ! python prepare_yaml_file.py "${run_type}" "${model}" "${max_steps}" "${run_name}" "${output_dir}"; then
    echo "warning: prepare_yaml_file.py failed; training will likely fail" >&2
fi

export USE_MODELSCOPE_HUB=1

# Start the NPU status recorder in the background and remember its PID so it
# can be stopped once training has finished.
echo "Start recording npu status "
bash npu_status.sh "${output_dir}" 60 0 &
npu_status_pid=$!
# echo "${npu_status_pid}"

# Launch training in the background, mirroring its output to log.txt.
#
# The pipeline runs inside a subshell that exits with PIPESTATUS[0], the exit
# code of llamafactory-cli. Backgrounding the bare pipeline would make $! the
# PID of tee, so a later `wait` would report tee's status and a failed
# training run would look successful.
if [ "${gpu_cnt}" = "1" ]; then
   (
       # Single device: pin the job to device 0.
       ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train "${output_dir}/${run_name}.yaml" 2>&1 \
           | tee -i "${output_dir}/log.txt"
       exit "${PIPESTATUS[0]}"
   ) &
   train_pid=$!
   echo "Start single npu train"
else
   (
       # Multiple devices: force the torchrun launcher.
       FORCE_TORCHRUN=1 llamafactory-cli train "${output_dir}/${run_name}.yaml" 2>&1 \
           | tee -i "${output_dir}/log.txt"
       exit "${PIPESTATUS[0]}"
   ) &
   train_pid=$!
   echo "Start multi npu train"
fi

# Block until the background training job is done and keep its exit status.
train_status=0
wait "${train_pid}" || train_status=$?
echo "Train ended"

# Keep the status recorder alive a little longer before stopping it.
# NOTE(review): presumably 60 s lets the recorder take one final sample - confirm.
sleep 60
kill "${npu_status_pid}"
echo "Npu status ended"

echo "train_status： ${train_status}"

# Mark failed runs with a "_fail" suffix. The directory itself must be
# renamed: changing only the variable would leave the upload and the cleanup
# pointing at a path that does not exist.
remote_name="${run_name}"
if [ "${train_status}" -ne 0 ]; then
    failed_dir="${output_dir}_fail"
    if mv -- "${output_dir}" "${failed_dir}"; then
        output_dir="${failed_dir}"
        # NOTE(review): the remote copy carries the suffix too so failed runs
        # stay distinguishable after the local copy is removed - confirm this
        # matches the intended OBS layout.
        remote_name="${run_name}_fail"
    else
        echo "warning: could not rename ${output_dir} to ${failed_dir}; uploading it unmarked" >&2
    fi
fi

# Upload the results. Paths are passed as arguments rather than spliced into
# the Python source, so quotes in a name cannot break the statement.
if python3 -c "import sys; import moxing as mox; mox.file.copy_parallel(sys.argv[1], sys.argv[2])" \
    "${output_dir}" "obs://xty/results/${remote_name}"; then
    echo "${run_name} uploaded"
    # Remove the local copy only after a successful upload; ":?" aborts
    # instead of running rm on an empty path.
    rm -r -- "${output_dir:?}"
else
    echo "error: upload of ${output_dir} failed; keeping local results" >&2
fi
-												fix: fix first line

											
										
										
											2024-09-05 12:54:15 +08:00
+								#!/bin/bash
-												feat: add cur_time to log

											
										
										
											2024-09-18 16:11:43 +08:00
 								# bash run_once.sh lora_sft Qwen-7B 8 50
-												fix: modify npu status for 910b

											
										
										
											2024-09-18 17:14:01 +08:00
+								# bash run_once.sh lora_sft Qwen-7B 1 50
-												chore: move code

											
										
										
											2024-10-23 20:35:24 +08:00
+								# bash run_once.sh lora_sft 9g-8B 8 500
-												chore: change npu id array

											
										
										
											2024-09-18 16:18:37 +08:00
+								# bash run_once.sh inference Qwen-7B 8 50
-												fix: fix space

											
										
										
											2024-09-05 12:59:02 +08:00
+								run_type="$1"
 								model="$2"
 								gpu_cnt="$3"
 								max_steps="$4"
-												feat: done easy run

											
										
										
											2024-09-05 11:28:19 +08:00
-												chore: add echo

											
										
										
											2024-09-19 15:30:31 +08:00
+								echo "run_type: ${run_type}"
 								echo "model: ${model}"
 								echo "gpu_cnt: ${gpu_cnt}"
 								echo "max_steps: ${max_steps}"
-												feat: done easy run

											
										
										
											2024-09-05 11:28:19 +08:00
+								current_datetime=$(date +%Y%m%d%H%M%S)
-												chore: move code

											
										
										
											2024-10-23 20:35:24 +08:00
+								source /usr/local/Ascend/ascend-toolkit/set_env.sh
-												fix: fix space

											
										
										
											2024-09-05 12:59:02 +08:00
+								if [ "${run_type}"="lora_sft" ]; then
-												feat: done easy run

											
										
										
											2024-09-05 11:28:19 +08:00
+								    run_name="${run_type}_${model}_${gpu_cnt}_gpu_${max_steps}_step_${current_datetime}"
 								else
 								    run_name="${run_type}_${model}_${gpu_cnt}_gpu_${current_datetime}"
 								fi
-												chore: change output dir

											
										
										
											2024-09-19 13:54:43 +08:00
+								output_dir="./results/910b/${run_name}"
-												feat: done easy run

											
										
										
											2024-09-05 11:28:19 +08:00
 								if [ ! -d "$output_dir" ]; then
 								    mkdir -p "$output_dir"
-												chore: add echo

											
										
										
											2024-09-05 13:09:43 +08:00
+								    echo "output_dir created: $output_dir"
-												feat: done easy run

											
										
										
											2024-09-05 11:28:19 +08:00
+								else
-												chore: add echo

											
										
										
											2024-09-05 13:09:43 +08:00
+								    echo "output_dir exists: $output_dir"
-												feat: done easy run

											
										
										
											2024-09-05 11:28:19 +08:00
+								fi
-												chore: add echo

											
										
										
											2024-09-05 13:09:43 +08:00
+								# echo "${run_type} ${model} ${gpu_cnt} ${max_steps} ${run_name} ${output_dir}"
 								python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${output_dir}
-												feat: done easy run

											
										
										
											2024-09-05 11:28:19 +08:00
-												fix: small change

											
										
										
											2024-09-05 13:37:17 +08:00
+								export USE_MODELSCOPE_HUB=1
-												feat: done easy run

											
										
										
											2024-09-05 11:28:19 +08:00
-												chore: reverse test

											
										
										
											2024-09-19 16:57:15 +08:00
+								echo "Start recording npu status "
 								bash npu_status.sh ${output_dir} 60 0 &
 								npu_status_pid=$!
 								# echo "${npu_status_pid}"
 								if [ "${gpu_cnt}" = "1" ]; then
-												chore: modify tee

											
										
										
											2024-09-24 09:41:24 +08:00
+								   ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yaml 2>&1 | tee -i "${output_dir}/log.txt" &
-												chore: reverse test

											
										
										
											2024-09-19 16:57:15 +08:00
+								   train_pid=$!
 								   echo "Start single npu train"
 								else
-												chore: modify tee

											
										
										
											2024-09-24 09:41:24 +08:00
+								   FORCE_TORCHRUN=1 llamafactory-cli train ${output_dir}/${run_name}.yaml 2>&1 | tee -i "${output_dir}/log.txt" &
-												chore: reverse test

											
										
										
											2024-09-19 16:57:15 +08:00
+								   train_pid=$!
 								   echo "Start multi npu train"
 								fi
 								wait $train_pid
-												feat: add fail mark for fail process

											
										
										
											2024-09-24 09:32:12 +08:00
+								train_status=$?
-												chore: reverse test

											
										
										
											2024-09-19 16:57:15 +08:00
+								echo "Train ended"
 								sleep 60
 								kill $npu_status_pid
 								echo "Npu status ended"
-												test: test moxing

											
										
										
											2024-09-19 16:54:59 +08:00
-												chore: add echo

											
										
										
											2024-09-25 09:57:05 +08:00
+								echo "train_status： ${train_status}"
-												feat: add fail mark for fail process

											
										
										
											2024-09-24 09:32:12 +08:00
+								if [ $train_status -ne 0 ]; then
 								    output_dir="${output_dir}_fail"
 								fi
-												test: test moxing

											
										
										
											2024-09-19 16:54:59 +08:00
+								python3 -c "import moxing as mox; mox.file.copy_parallel('${output_dir}', 'obs://xty/results/${run_name}')"
-												chore: add print run name

											
										
										
											2024-10-24 10:51:37 +08:00
+								echo "${run_name} uploaded"
-												test: test moxing

											
										
										
											2024-09-19 16:54:59 +08:00
+								rm -r ${output_dir}