LLaMA-Factory-310P3/run_once.sh

69 lines
1.7 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# bash run_once.sh lora_sft Qwen-7B 8 50
# bash run_once.sh lora_sft Qwen-7B 1 50
# bash run_once.sh inference Qwen-7B 8 50
# Positional arguments, in order: run type, model name, device count, max steps
# (see the usage examples at the top of the file).
run_type=$1
model=$2
gpu_cnt=$3
max_steps=$4
# Print every parsed argument as "<name>: <value>" so the log records the invocation.
for arg_name in run_type model gpu_cnt max_steps; do
  printf '%s: %s\n' "${arg_name}" "${!arg_name}"
done
# Timestamp (YYYYmmddHHMMSS) appended to the run name so repeated runs with the
# same arguments do not collide.
current_datetime=$(date +%Y%m%d%H%M%S)
# Only lora_sft runs carry the step count in their name; every other run type
# omits it.
# NOTE: the spaces around `=` are required. Without them `[` receives a single
# non-empty word and the test is always true.
if [ "${run_type}" = "lora_sft" ]; then
  run_name="${run_type}_${model}_${gpu_cnt}_gpu_${max_steps}_step_${current_datetime}"
else
  run_name="${run_type}_${model}_${gpu_cnt}_gpu_${current_datetime}"
fi
# All artifacts of this run are collected under a per-run directory.
output_dir="./results/910b/${run_name}"
# Create the directory on first use and report which case applied.
if [ -d "${output_dir}" ]; then
  echo "output_dir exists: $output_dir"
else
  mkdir -p "${output_dir}"
  echo "output_dir created: $output_dir"
fi
# echo "${run_type} ${model} ${gpu_cnt} ${max_steps} ${run_name} ${output_dir}"
# Build the run configuration. Presumably this writes
# ${output_dir}/${run_name}.yaml, which the training step reads - confirm
# against prepare_yaml_file.py.
# Arguments are quoted so values with spaces or glob characters are passed
# intact. max_steps is dropped entirely when empty, which keeps the argument
# count the same as the previous unquoted form.
python prepare_yaml_file.py "${run_type}" "${model}" ${max_steps:+"${max_steps}"} "${run_name}" "${output_dir}"
export USE_MODELSCOPE_HUB=1
echo "Start recording npu status "
# Start the NPU status recorder in the background and remember its PID so it
# can be stopped once training has finished.
bash npu_status.sh "${output_dir}" 60 0 &
npu_status_pid=$!
# echo "${npu_status_pid}"
#######################################
# Run a command with stdout and stderr mirrored to a log file, and report the
# command's own exit status instead of tee's.
# Arguments: $1  - log file path
#            $2+ - command to run, with its arguments
# Outputs:   the command's combined output, on stdout
# Returns:   the command's exit status (or tee's, if only tee failed)
#######################################
run_and_tee() {
  local log_file=$1
  shift
  (
    # Without pipefail the pipeline's status is tee's, which hides failures of
    # the command. The subshell keeps the option from leaking to the caller.
    set -o pipefail
    "$@" 2>&1 | tee -i "${log_file}"
  )
}

train_yaml="${output_dir}/${run_name}.yaml"
train_log="${output_dir}/log.txt"
# Backgrounding the function call makes $! the PID of the subshell that runs
# it, so `wait` below returns run_and_tee's status, i.e. the trainer's result.
if [ "${gpu_cnt}" = "1" ]; then
  # Single-device run: restrict the visible devices to device 0.
  run_and_tee "${train_log}" env ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train "${train_yaml}" &
  train_pid=$!
  echo "Start single npu train"
else
  run_and_tee "${train_log}" env FORCE_TORCHRUN=1 llamafactory-cli train "${train_yaml}" &
  train_pid=$!
  echo "Start multi npu train"
fi
wait "${train_pid}"
train_status=$?
echo "Train ended"
# Keep the status recorder running a little longer before stopping it.
# NOTE(review): 60 matches the second argument given to npu_status.sh, so this
# presumably allows one final sample - confirm.
sleep 60
kill "${npu_status_pid}"
echo "Npu status ended"
echo "train_status ${train_status}"
upload_name="${run_name}"
if [ "${train_status}" -ne 0 ]; then
  # Mark failed runs with a _fail suffix. The directory itself has to be
  # renamed; changing only the variable would leave the upload and the cleanup
  # pointing at a path that does not exist.
  # NOTE(review): the suffix is also applied to the OBS destination so failed
  # runs are distinguishable remotely - confirm this is the intended layout.
  failed_dir="${output_dir}_fail"
  if mv -- "${output_dir}" "${failed_dir}"; then
    output_dir="${failed_dir}"
    upload_name="${run_name}_fail"
  fi
fi
# Paths are passed through sys.argv rather than spliced into the Python source,
# so quotes or backslashes in a path cannot break the snippet.
if python3 -c "import sys; import moxing as mox; mox.file.copy_parallel(sys.argv[1], sys.argv[2])" \
  "${output_dir}" "obs://xty/results/${upload_name}"; then
  # Delete the local copy only after a successful upload; :? aborts instead of
  # running rm with an empty path.
  rm -r -- "${output_dir:?}"
else
  echo "upload failed; keeping ${output_dir}" >&2
fi