Compare commits
3 Commits
9bbf989502
...
6f1af13e03
Author | SHA1 | Date |
---|---|---|
wql | 6f1af13e03 | |
wql | ccbea71b65 | |
wql | 42d9773188 |
|
@ -167,3 +167,4 @@ config/
|
|||
saves/
|
||||
output/
|
||||
wandb/
|
||||
.vscode
|
||||
|
|
|
@ -0,0 +1,60 @@
|
|||
#!/bin/bash
#
# Periodically samples Ascend NPU power draw and per-device memory usage with
# `npu-smi` and appends one JSON object per sample (JSON-lines) to
#   <output_dir>/gpu_status_<YYYYmmddHHMMSS>.json
#
# Usage: npu_status.sh <output_dir> <sleep_time> [print_to_screen] [max_samples]
#   output_dir       directory that receives the log file (created if missing)
#   sleep_time       seconds to wait between two samples
#   print_to_screen  "1" to also echo every sample to stdout (default: 0)
#   max_samples      stop after this many samples (default: 1500)
#
# Exit status: 0 after max_samples samples, 2 on missing arguments,
#              1 if the output directory cannot be created.

# Device layout, indexed by device id: chip id and owning NPU id per device.
# NOTE(review): these tables are hard-coded (two NPUs, 5 and 6, with two chips
# each) — confirm they match the host this script runs on.
readonly chip_id_arr_from_info=(0 1 0 1)
readonly npu_id_arr_from_info=(5 5 6 6)
readonly npu_id_arr=(5 6)

#######################################
# Prints $1 unchanged when it is a plain decimal number, otherwise `null`,
# so that a missing or non-numeric reading never produces malformed JSON.
# Arguments: $1 - raw reading (may be empty)
# Outputs:   the number or the literal null, without trailing newline
#######################################
json_number() {
  local value="$1"
  if [[ "${value}" =~ ^-?[0-9]+(\.[0-9]+)?$ ]]; then
    printf '%s' "${value}"
  else
    printf 'null'
  fi
}

#######################################
# Joins all arguments with ", " (used to build JSON array bodies).
# Arguments: $@ - items to join
# Outputs:   the joined string, without trailing newline
#######################################
join_json_items() {
  local joined="" item
  for item in "$@"; do
    joined+="${joined:+, }${item}"
  done
  printf '%s' "${joined}"
}

#######################################
# Reads the power dissipation of one NPU.
# Arguments: $1 - NPU id as understood by `npu-smi -i`
# Outputs:   the last whitespace-separated field of the first
#            "Power Dissipation(W)" line; empty if no such line exists
#######################################
get_power_dissipation() {
  local npu_id="$1"
  local power_dissipation
  # need to verify: assumes the value is the last field of the line, e.g.
  # "Power Dissipation(W) : 12.5" — TODO confirm on the target host.
  power_dissipation=$(npu-smi info -t power -i "${npu_id}" \
    | awk '/Power Dissipation\(W\)/ {print $NF; exit}')
  echo "${power_dissipation}"
}

#######################################
# Reads the memory usage of one device from the `npu-smi info` table.
# Arguments: $1 - chip id, $2 - device id
# Outputs:   field 8 of the first table row whose 2nd and 3rd fields are the
#            chip and device id; empty if no row matches
#######################################
get_mem_usage() {
  local chip="$1"
  local device="$2"
  local mem_usage
  # Matching on fields instead of a literal "<chip> <device>" string tolerates
  # any amount of column padding. Only the first match is used.
  # NOTE(review): assumes rows look like
  # "| <chip> <device> | <bus-id> | <aicore> <mem-used> / <mem-total> |".
  mem_usage=$(npu-smi info \
    | awk -v chip="${chip}" -v device="${device}" \
        '$1 == "|" && $2 == chip && $3 == device {print $8; exit}')
  echo "${mem_usage}"
}

#######################################
# Takes one sample of all NPUs/devices and prints it as a single JSON line.
# Globals:   chip_id_arr_from_info, npu_id_arr_from_info, npu_id_arr (read)
# Arguments: $1 - number of devices to query
# Outputs:   one JSON object followed by a newline
#######################################
build_sample_json() {
  local device_cnt="$1"
  local -a mem_items=() power_items=()
  local i npu_id reading item cur_time

  for ((i = 0; i < device_cnt; i++)); do
    reading=$(json_number "$(get_mem_usage "${chip_id_arr_from_info[i]}" "${i}")")
    printf -v item \
      '{"npu_id": %s, "chip_id": %s, "device_id": %s, "mem_usage": %s}' \
      "${npu_id_arr_from_info[i]}" "${chip_id_arr_from_info[i]}" "${i}" "${reading}"
    mem_items+=("${item}")
  done

  for npu_id in "${npu_id_arr[@]}"; do
    reading=$(json_number "$(get_power_dissipation "${npu_id}")")
    printf -v item '{"npu_id": %s, "power_dissipation": %s}' \
      "${npu_id}" "${reading}"
    power_items+=("${item}")
  done

  cur_time=$(date +"%Y-%m-%d %H:%M:%S")
  printf '{"cur_time": "%s", "npu_power_dissipation": [%s], "device_mem_usage": [%s]}\n' \
    "${cur_time}" \
    "$(join_json_items "${power_items[@]}")" \
    "$(join_json_items "${mem_items[@]}")"
}

#######################################
# Entry point: validates arguments, then samples until max_samples is reached.
# Arguments: see the usage block at the top of the file
# Returns:   0 when the sample limit is reached, 2 on usage error,
#            1 if the output directory cannot be created
#######################################
main() {
  local output_dir="${1:-}"
  local sleep_time="${2:-}"
  local print_to_screen="${3:-0}"
  local max_samples="${4:-1500}"

  if [[ -z "${output_dir}" || -z "${sleep_time}" ]]; then
    echo "Usage: ${0##*/} <output_dir> <sleep_time> [print_to_screen] [max_samples]" >&2
    return 2
  fi

  mkdir -p -- "${output_dir}" || return 1
  local output_file_path
  output_file_path="${output_dir}/gpu_status_$(date +"%Y%m%d%H%M%S").json"

  # One "310P3" row is expected per device in the `npu-smi info` table.
  local device_cnt
  device_cnt=$(npu-smi info | grep -c '310P3')
  # Never index past the hard-coded layout tables.
  if ((device_cnt > ${#chip_id_arr_from_info[@]})); then
    device_cnt=${#chip_id_arr_from_info[@]}
  fi

  local loop_cnt=0
  local json
  while true; do
    json=$(build_sample_json "${device_cnt}")
    echo "${json}" >>"${output_file_path}"

    # String comparison: an empty or non-numeric flag simply means "quiet".
    if [[ "${print_to_screen}" == "1" ]]; then
      echo "${json}"
    fi

    # Arithmetic increment; `loop_cnt+=1` would append the character "1".
    loop_cnt=$((loop_cnt + 1))
    if ((loop_cnt >= max_samples)); then
      return 0
    fi

    sleep "${sleep_time}"
  done
}

main "$@"
|
|
@ -17,7 +17,7 @@ def main():
|
|||
template = ""
|
||||
if model == "9g-8B":
|
||||
model_name_or_path = "../../models/sft_8b_v2"
|
||||
template = ""
|
||||
template = "default"
|
||||
elif model == "Baichuan2-7B":
|
||||
model_name_or_path = "../../models/Baichuan2-7B-Base"
|
||||
template = "baichuan2"
|
||||
|
|
15
run_once.sh
15
run_once.sh
|
@ -28,11 +28,10 @@ python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${outp
|
|||
|
||||
export USE_MODELSCOPE_HUB=1

# echo "Start recording gpu status "
# # 0 means not printing gpu status
# python gpu_status.py ${output_dir} 1 10 &
# gpu_status_pid=$!
# echo "${gpu_status_pid}"

# Launch the NPU monitor in the background and remember its PID so it can be
# stopped once training is over. output_dir is quoted so that a path
# containing spaces or glob characters reaches the script as one argument.
echo "Start recording npu status "
bash npu_status.sh "${output_dir}" 10 1 &
npu_status_pid=$!
echo "${npu_status_pid}"
|
||||
|
||||
# The spaces around "=" are required: without them [ receives the single
# non-empty word "<gpu_cnt>=1" and the test is true for every value.
if [ "${gpu_cnt}" = "1" ]; then
|
||||
ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yaml | tee "${output_dir}/log.txt" &
|
||||
|
@ -47,6 +46,6 @@ fi
|
|||
# Quoted so that an empty train_pid is reported as an error instead of
# turning into a bare `wait`, which would block on every background job.
wait "${train_pid}"
echo "Train ended"

# sleep 90
# kill $gpu_status_pid
# echo "Gpu status ended"

# Keep recording for another 90 seconds after training, then stop the monitor.
# The monitor may already have exited by itself, so a failing kill is not
# treated as an error.
sleep 90
kill "${npu_status_pid}" 2>/dev/null || true
echo "Npu status ended"
|
Loading…
Reference in New Issue