170 lines
6.5 KiB
Python
170 lines
6.5 KiB
Python
|
import os
|
||
|
import time
|
||
|
import json
|
||
|
|
||
|
# When True the report functions emit only the mean of each configuration;
# when False they emit every run followed by the mean.
AVG_ONLY = False

# "lora_sft" makes the script read train_results.json; any other value makes
# it read predict_results.json.
TRAIN_TYPE = "inference"

# Directory that holds one sub-folder per benchmark run.
RESULTS_PATH = r"C:\Users\wengq\Desktop\QY项目\910b_results\final"

# Models to report on; ChatGLM2-6B is only included for LoRA SFT.
MODEL_PREFIX = (
    ["Qwen-7B", "Llama2-7B", "ChatGLM2-6B", "Baichuan2-7B"]
    if TRAIN_TYPE == "lora_sft"
    else ["Qwen-7B", "Llama2-7B", "Baichuan2-7B"]
)
|
||
|
|
||
|
def _mean_or_nan(values):
    """Return the arithmetic mean of *values*, or NaN when *values* is empty."""
    return sum(values) / len(values) if values else float("nan")


def get_train_result(step500):
    """Summarise the runtime figures of every benchmark run as CSV text.

    For each model in ``MODEL_PREFIX`` and for the 8-card and then the 1-card
    setup, every folder in ``RESULTS_PATH`` whose name starts with
    ``<TRAIN_TYPE>_<model>_<cards>_gpu_<steps>`` is treated as one run. Its
    ``train_results.json`` (``TRAIN_TYPE == "lora_sft"``) or
    ``predict_results.json`` (otherwise) is read for the runtime figures.

    Args:
        step500: truthy selects the 500-step folders, falsy the 1000-step ones.

    Returns:
        Three newline-terminated, comma-separated lines: total runtime,
        runtime per step, and tokens per second. The last two lines are empty
        unless ``TRAIN_TYPE`` is ``"lora_sft"``. With ``AVG_ONLY`` each
        configuration contributes only its mean; otherwise every run is
        listed, followed by the mean. A configuration with no matching folder
        contributes ``nan`` as its mean instead of raising
        ``ZeroDivisionError``, so the columns stay aligned.
    """
    step = 500 if step500 else 1000
    is_lora_sft = TRAIN_TYPE == "lora_sft"
    file_prefix = "train" if is_lora_sft else "predict"
    # The listing is the same for every configuration, so read it once.
    dir_names = os.listdir(RESULTS_PATH)

    all_total_sec = []
    all_step_sec = []
    all_token_per_sec = []

    for model_prefix in MODEL_PREFIX:
        # 8-card results come first, then single-card, as in the original
        # ``for single in range(2)`` ordering.
        for gpu_cnt in (8, 1):
            folder_prefix = f"{TRAIN_TYPE}_{model_prefix}_{gpu_cnt}_gpu_{step}"
            total_sec = []
            step_sec = []
            token_per_sec = []

            for dir_name in dir_names:
                if not dir_name.startswith(folder_prefix):
                    continue
                results_file = os.path.join(
                    RESULTS_PATH, dir_name, file_prefix + "_results.json"
                )
                with open(results_file, "r", encoding="utf-8") as file:
                    train_results = json.load(file)

                total_sec.append(train_results[file_prefix + "_runtime"])
                if is_lora_sft:
                    step_sec.append(train_results["train_runtime"] / step)
                    token_per_sec.append(train_results["train_tokens_per_second"])

            # Only the series that were actually collected get a mean.
            collected = [total_sec]
            if is_lora_sft:
                collected.extend([step_sec, token_per_sec])
            for values in collected:
                mean = _mean_or_nan(values)
                if AVG_ONLY:
                    values[:] = [mean]
                else:
                    values.append(mean)

            all_total_sec.extend(total_sec)
            all_step_sec.extend(step_sec)
            all_token_per_sec.extend(token_per_sec)

    result = (
        ",".join(map(str, all_total_sec)) + "\n"
        + ",".join(map(str, all_step_sec)) + "\n"
        + ",".join(map(str, all_token_per_sec)) + "\n"
    )
    print(result)
    return result
|
||
|
|
||
|
def get_detail_folder_path(model_folder, model_prefix, run_no, single, step500):
    """Build the path of the detail folder that belongs to one run.

    The folder name is ``<model_prefix>_<TRAIN_TYPE>_<run_no>`` with
    ``_single`` appended when *single* is truthy and ``_step500`` appended
    when *step500* is truthy. It is located under
    ``RESULTS_PATH/TRAIN_TYPE/model_folder``.
    """
    name_parts = [model_prefix, "_", TRAIN_TYPE, "_", str(run_no)]
    if single:
        name_parts.append("_single")
    if step500:
        name_parts.append("_step500")
    return os.path.join(RESULTS_PATH, TRAIN_TYPE, model_folder, "".join(name_parts))
|
||
|
|
||
|
def get_train_start_end_time(model_folder, model_prefix, run_no, single, step500):
    """Return the first and last ``cur_time`` of a run's ``trainer_log.jsonl``.

    The log is looked up in the folder given by ``get_detail_folder_path``.
    Both timestamps are parsed with the ``%Y-%m-%d %H:%M:%S`` format and
    returned as a ``(start, end)`` pair of ``time.struct_time`` values.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    run_folder = get_detail_folder_path(model_folder, model_prefix, run_no, single, step500)
    log_path = os.path.join(run_folder, "trainer_log.jsonl")

    first_stamp = get_first_json(log_path)['cur_time']
    started = time.strptime(first_stamp, timestamp_format)

    last_stamp = get_last_json(log_path)['cur_time']
    finished = time.strptime(last_stamp, timestamp_format)

    return started, finished
|
||
|
|
||
|
|
||
|
def _mean_of_samples(values):
    """Return the arithmetic mean of *values*, or NaN when *values* is empty."""
    return sum(values) / len(values) if values else float("nan")


def get_gpu_result(step500):
    """Summarise peak memory and mean power draw of every run as CSV text.

    For each model in ``MODEL_PREFIX`` and for the 8-card and then the 1-card
    setup, every folder in ``RESULTS_PATH`` whose name starts with
    ``<TRAIN_TYPE>_<model>_<cards>_gpu_<steps>`` is treated as one run. Every
    file in it whose name starts with ``npu_status`` is read as JSON Lines.
    Each line is passed to ``get_max_mem_in_multi_gpu`` and
    ``get_sum_power_consumption``. A run is reduced to the maximum of the
    memory readings and the mean of the power readings.

    Args:
        step500: truthy selects the 500-step folders, falsy the 1000-step ones.

    Returns:
        Two comma-separated lines joined by a newline (no trailing newline):
        memory figures, then power figures. With ``AVG_ONLY`` each
        configuration contributes only its mean over runs; otherwise every
        run is listed, followed by the mean. A run without any sample, or a
        configuration without any run, contributes ``nan`` instead of raising
        ``ValueError`` / ``ZeroDivisionError``.
    """
    step = 500 if step500 else 1000
    # The listing is the same for every configuration, so read it once.
    dir_names = os.listdir(RESULTS_PATH)
    missing = float("nan")

    all_mem = []
    all_power_consumption = []

    for model_prefix in MODEL_PREFIX:
        for single in range(2):
            gpu_cnt = 1 if single else 8
            folder_prefix = f"{TRAIN_TYPE}_{model_prefix}_{gpu_cnt}_gpu_{step}"
            all_run_max_mem = []
            all_run_avg_power_consumption = []

            for dir_name in dir_names:
                if not dir_name.startswith(folder_prefix):
                    continue
                run_dir = os.path.join(RESULTS_PATH, dir_name)

                max_mems = []
                power_consumptions = []
                for file_name in os.listdir(run_dir):
                    if not file_name.startswith("npu_status"):
                        continue
                    with open(os.path.join(run_dir, file_name), "r", encoding="utf-8") as file:
                        for line in file:
                            # A blank line carries no sample and would make
                            # json.loads fail.
                            if not line.strip():
                                continue
                            gpu_results = json.loads(line)
                            max_mems.append(get_max_mem_in_multi_gpu(gpu_results))
                            power_consumptions.append(
                                get_sum_power_consumption(gpu_results, single)
                            )

                all_run_max_mem.append(max(max_mems, default=missing))
                all_run_avg_power_consumption.append(_mean_of_samples(power_consumptions))

            for values in (all_run_max_mem, all_run_avg_power_consumption):
                mean = _mean_of_samples(values)
                if AVG_ONLY:
                    values[:] = [mean]
                else:
                    values.append(mean)

            all_mem.extend(all_run_max_mem)
            all_power_consumption.extend(all_run_avg_power_consumption)

    result = ",".join(map(str, all_mem)) + "\n" + ",".join(map(str, all_power_consumption))
    print(result)
    return result
|
||
|
|
||
|
def get_sum_power_consumption(gpu_results, single, card_no=0, card_cnt=8):
    """Return the power dissipation reported by one status sample.

    Args:
        gpu_results: parsed status sample; its ``"npu_power_dissipation"``
            entry is a list with one ``{"power_dissipation": ...}`` item per
            card.
        single: truthy to report only card *card_no*, falsy to report the
            total over the first *card_cnt* cards.
        card_no: index of the card used in single-card mode.
        card_cnt: number of cards summed in multi-card mode. It defaults to 8,
            matching the 8-card configuration the callers report on (the
            previous hard-coded ``range(7)`` silently dropped the last card).
            If the sample lists fewer cards, all of them are summed.

    Returns:
        The single card's reading, or the sum of the selected readings.
    """
    cards = gpu_results["npu_power_dissipation"]
    if single:
        return cards[card_no]["power_dissipation"]
    return sum(card["power_dissipation"] for card in cards[:card_cnt])
|
||
|
|
||
|
def get_max_mem_in_multi_gpu(gpu_results):
    """Return the largest ``mem_usage_percent`` of one status sample times 64.

    Every entry of ``gpu_results["device_mem_usage"]`` is inspected. The
    result is never below 0, because the search starts from 0.
    """
    readings = [device["mem_usage_percent"] for device in gpu_results["device_mem_usage"]]
    # Seeding with 0 gives 0 for an empty sample and for all-negative readings.
    peak_percent = max([0] + readings)
    # NOTE(review): 64 presumably converts the usage ratio into GB for a
    # 64 GB card - confirm against the monitoring tool's output format.
    return peak_percent * 64
|
||
|
|
||
|
|
||
|
|
||
|
def get_first_json(jsonl_file_path):
    """Parse the first line of the UTF-8 file at *jsonl_file_path* as JSON.

    Only the first line is read; whatever ``json.loads`` produces for it is
    returned.
    """
    with open(jsonl_file_path, mode="r", encoding="utf-8") as handle:
        opening_line = handle.readline()
    return json.loads(opening_line)
|
||
|
|
||
|
def get_last_json(jsonl_file_path):
    """Parse every line of the UTF-8 file at *jsonl_file_path* and return the last.

    Each line is decoded with ``json.loads`` in order. The value decoded from
    the final line is returned, or ``None`` when the file has no lines.
    """
    final_record = None
    with open(jsonl_file_path, mode="r", encoding="utf-8") as handle:
        # The loop variable ends up holding the last decoded line.
        for final_record in map(json.loads, handle):
            pass
    return final_record
|
||
|
|
||
|
|
||
|
def main():
    """Print the runtime summary and the NPU summary for the 1000-step runs."""
    # Switching this to True would report on the 500-step result folders.
    use_step500 = False
    get_train_result(use_step500)
    get_gpu_result(use_step500)


if __name__ == "__main__":
    main()
|