From 42d9773188e79a3414713d3f474665e2c42e8534 Mon Sep 17 00:00:00 2001
From: wql
Date: Fri, 6 Sep 2024 14:03:54 +0800
Subject: [PATCH] feat: add npu status

---
Review notes (this section is ignored when the patch is applied):
- npu_status.sh: the helper functions now precede the sampling loop. They
  used to follow an endless `while true` loop, so they were never defined
  when the loop called them.
- npu_status.sh: `>> output_file_path` and `sleep sleep_time` were missing
  the `$`, so records went to a file literally named "output_file_path" and
  sleep received a non-numeric argument.
- npu_status.sh: `loop_cnt+=1` appended the character "1" instead of adding
  one, so the 1500-sample exit was never reached.
- npu_status.sh: `grep '${chip} ${device}'` was single-quoted and therefore
  searched for the literal text.
- run_once.sh: `[ "${gpu_cnt}"="1" ]` is a one-word test that is always true;
  spaces were added around `=`.
- NOTE(review): in the copy under review the loop body of npu_status.sh was
  truncated between `for ((i=0; i<` and `>> output_file_path`. That span is
  reconstructed here; confirm it against the original commit. The leading
  whitespace of the run_once.sh context lines is assumed as well.

 npu_status.sh | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++
 run_once.sh   | 19 ++++++------
 2 files changed, 95 insertions(+), 9 deletions(-)
 create mode 100644 npu_status.sh

diff --git a/npu_status.sh b/npu_status.sh
new file mode 100644
index 00000000..ce84a03d
--- /dev/null
+++ b/npu_status.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+#
+# Periodically samples NPU memory usage and power dissipation with
+# npu-smi and appends one JSON record per sample to a timestamped file.
+#
+# Usage: bash npu_status.sh <output_dir> <sleep_time> <print_to_screen>
+#   output_dir       directory that receives gpu_status_<timestamp>.json
+#   sleep_time       seconds to wait between two samples
+#   print_to_screen  "1" to also echo every record to stdout
+
+output_dir=$1
+sleep_time=$2
+print_to_screen=$3
+
+# Stop after this many samples even if the caller never kills this script.
+readonly max_loop_cnt=1500
+
+output_file_path="${output_dir}/gpu_status_$(date +"%Y%m%d%H%M%S").json"
+
+device_cnt=$(npu-smi info | grep -c '310P3')
+chip_id_arr_from_info=(0 1 0 1)
+npu_id_arr_from_info=(5 5 6 6)
+npu_id_arr=(5 6)
+
+# The helpers are defined before the sampling loop: bash looks a function up
+# when the call runs, so definitions placed after an endless loop are
+# never reached.
+
+# Prints the power dissipation that npu-smi reports for one NPU.
+# Arguments: $1 - NPU id passed to `npu-smi info -t power -i`
+get_power_dissipation() {
+  local npu_id="$1"
+  local power_dissipation
+  # need to verify: the awk field index has not been checked against real
+  # npu-smi output
+  power_dissipation=$(npu-smi info -t power -i "${npu_id}" \
+    | grep 'Power Dissipation(W)' | awk '{print $2}')
+  echo "${power_dissipation}"
+}
+
+# Prints the 8th whitespace-separated field of the `npu-smi info` row that
+# contains "<chip> <device>".
+# Arguments: $1 - chip id, $2 - device id
+get_mem_usage() {
+  local chip="$1"
+  local device="$2"
+  local mem_usage
+  # Double quotes so the ids are expanded; single quotes made grep look for
+  # the literal text ${chip} ${device}.
+  mem_usage=$(npu-smi info | grep -- "${chip} ${device}" | awk '{print $8}')
+  echo "${mem_usage}"
+}
+
+loop_cnt=0
+
+while true; do
+  # NOTE(review): everything from here down to the append into the output
+  # file was truncated in the reviewed copy and is reconstructed; confirm the
+  # helper arguments and the JSON layout against the original commit.
+  device_mem_usage=""
+  for ((i = 0; i < device_cnt; i++)); do
+    mem_usage=$(get_mem_usage "${chip_id_arr_from_info[i]}" "${i}")
+    device_mem_usage+="${device_mem_usage:+, }\"${mem_usage}\""
+  done
+
+  power_dissipation=""
+  for npu_id in "${npu_id_arr[@]}"; do
+    power=$(get_power_dissipation "${npu_id}")
+    power_dissipation+="${power_dissipation:+, }\"${power}\""
+  done
+
+  printf -v json '{"cur_time": "%s", "device_mem_usage": [%s], "power_dissipation": [%s]}' \
+    "$(date +"%Y-%m-%d %H:%M:%S")" "${device_mem_usage}" "${power_dissipation}"
+  echo "${json}" >> "${output_file_path}"
+  sleep "${sleep_time}"
+
+  if [[ "${print_to_screen}" == "1" ]]; then
+    echo "${json}"
+  fi
+
+  loop_cnt=$((loop_cnt + 1))
+  if ((loop_cnt >= max_loop_cnt)); then
+    exit 0
+  fi
+done
diff --git a/run_once.sh b/run_once.sh
index ce85dc49..f15ed891 100644
--- a/run_once.sh
+++ b/run_once.sh
@@ -28,11 +28,11 @@ python prepare_yaml_file.py ${run_type} ${model} ${max_steps} ${run_name} ${outp
 
 export USE_MODELSCOPE_HUB=1
 
-# echo "Start recording gpu status "
-# # 0 means not printing gpu status
-# python gpu_status.py ${output_dir} 1 10 &
-# gpu_status_pid=$!
-# echo "${gpu_status_pid}"
+echo "Start recording npu status "
+# Arguments: output dir, seconds between samples, 1 = also print each record
+bash npu_status.sh "${output_dir}" 10 1 &
+npu_status_pid=$!
+echo "${npu_status_pid}"
 
-if [ "${gpu_cnt}"="1" ]; then
+if [ "${gpu_cnt}" = "1" ]; then
     ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train ${output_dir}/${run_name}.yaml | tee "${output_dir}/log.txt" &
@@ -47,6 +47,7 @@ fi
 wait $train_pid
 echo "Train ended"
 
-# sleep 90
-# kill $gpu_status_pid
-# echo "Gpu status ended"
\ No newline at end of file
+sleep 90
+# The recorder exits by itself after 1500 samples, so it may already be gone.
+kill "${npu_status_pid}" 2>/dev/null || echo "Npu status recorder already exited"
+echo "Npu status ended"