fix: modify npu status for 910b
This commit is contained in:
parent
63556d6571
commit
7a808cbb75
|
@ -4,24 +4,19 @@ output_dir=$1
|
||||||
sleep_time=$2
|
sleep_time=$2
|
||||||
print_to_screen=$3
|
print_to_screen=$3
|
||||||
|
|
||||||
output_file_path="${output_dir}/gpu_status_$(date +"%Y%m%d%H%M%S").json"
|
output_file_path="${output_dir}/npu_status_$(date +"%Y%m%d%H%M%S").json"
|
||||||
|
device_cnt=$(npu-smi info | grep '910B1' | wc -l)
|
||||||
device_cnt=$(npu-smi info | grep '310P3' | wc -l)
|
|
||||||
chip_id_arr_from_info=(0 0 0 0 0 0 0 0)
|
|
||||||
npu_id_arr_from_info=(0 1 2 3 4 5 6 7)
|
|
||||||
npu_id_arr=(0 1 2 3 4 5 6 7)
|
|
||||||
|
|
||||||
|
|
||||||
get_power_dissipation() {
|
get_power_dissipation() {
|
||||||
local npu_id="$1"
|
local npu_id="$1"
|
||||||
power_dissipation=$(npu-smi info -t power -i ${npu_id} | grep "Power Dissipation(W)" | awk '{print $4}')
|
power_dissipation=$(npu-smi info -t power -i ${npu_id} | grep "NPU Real-time Power(W)" | awk '{print $5}')
|
||||||
echo "${power_dissipation}"
|
echo "${power_dissipation}"
|
||||||
}
|
}
|
||||||
|
|
||||||
get_mem_usage() {
|
get_mem_usage() {
|
||||||
local chip="$1"
|
local npu_id="$1"
|
||||||
local device="$2"
|
mem_usage=$(npu-smi info -t usages -i ${npu_id} | grep "HBM Usage Rate(%)" | awk '{print $5}')
|
||||||
mem_usage=$(npu-smi info | grep "${chip} ${device}" | awk '{print $8}')
|
|
||||||
echo "${mem_usage}"
|
echo "${mem_usage}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -32,15 +27,15 @@ for i in {1..1500}
|
||||||
do
|
do
|
||||||
device_mem_usage=""
|
device_mem_usage=""
|
||||||
for ((i=0; i<device_cnt; i++)); do
|
for ((i=0; i<device_cnt; i++)); do
|
||||||
mem_usage=$(get_mem_usage ${chip_id_arr_from_info[i]} ${i})
|
mem_usage=$(get_mem_usage ${i})
|
||||||
device_mem_usage="${device_mem_usage}{\"npu_id\": ${npu_id_arr_from_info[i]}, \"chip_id\": ${chip_id_arr_from_info[i]}, \"device_id\": ${i}, \"mem_usage\": ${mem_usage}}, "
|
device_mem_usage="${device_mem_usage}{\"npu_id\": ${i}, \"mem_usage_percent\": ${mem_usage}}, "
|
||||||
done
|
done
|
||||||
device_mem_usage="${device_mem_usage%??}"
|
device_mem_usage="${device_mem_usage%??}"
|
||||||
|
|
||||||
npu_power_dissipation=""
|
npu_power_dissipation=""
|
||||||
for npu_id in "${npu_id_arr[@]}"; do
|
for ((i=0; i<device_cnt; i++)); do
|
||||||
power_dissipation=$(get_power_dissipation ${npu_id})
|
power_dissipation=$(get_power_dissipation ${i})
|
||||||
npu_power_dissipation="${npu_power_dissipation}{\"npu_id\": ${npu_id}, \"power_dissipation\": ${power_dissipation}}, "
|
npu_power_dissipation="${npu_power_dissipation}{\"npu_id\": ${i}, \"power_dissipation\": ${power_dissipation}}, "
|
||||||
done
|
done
|
||||||
npu_power_dissipation="${npu_power_dissipation%??}"
|
npu_power_dissipation="${npu_power_dissipation%??}"
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
|
|
||||||
|
|
||||||
# bash run_once.sh lora_sft Qwen-7B 8 50
|
# bash run_once.sh lora_sft Qwen-7B 8 50
|
||||||
|
# bash run_once.sh lora_sft Qwen-7B 1 50
|
||||||
# bash run_once.sh inference Qwen-7B 8 50
|
# bash run_once.sh inference Qwen-7B 8 50
|
||||||
run_type="$1"
|
run_type="$1"
|
||||||
model="$2"
|
model="$2"
|
||||||
|
@ -34,7 +35,7 @@ export USE_MODELSCOPE_HUB=1
|
||||||
echo "Start recording npu status "
|
echo "Start recording npu status "
|
||||||
bash npu_status.sh ${output_dir} 60 0 &
|
bash npu_status.sh ${output_dir} 60 0 &
|
||||||
npu_status_pid=$!
|
npu_status_pid=$!
|
||||||
echo "${npu_status_pid}"
|
# echo "${npu_status_pid}"
|
||||||
|
|
||||||
if [ "${gpu_cnt}"="1" ]; then
|
if [ "${gpu_cnt}"="1" ]; then
|
||||||
ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yaml | tee "${output_dir}/log.txt" &
|
ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yaml | tee "${output_dir}/log.txt" &
|
||||||
|
|
Loading…
Reference in New Issue