# Origin: sort_result.py, added by patch e72ed9c2ca08e4ee0585ad845058b153ffd920d4
#   From: wql
#   Date: Sat, 12 Oct 2024 10:20:03 +0800
#   Subject: [PATCH] feat: add sort result
"""Collect benchmark results from run folders and print them as CSV rows.

Each run lives in a folder under ``RESULTS_PATH`` whose name starts with
``<TRAIN_TYPE>_<model>_<card count>_gpu_<steps>``.  The functions below read
the ``*_results.json`` and ``npu_status*`` files found there and emit
comma-separated rows, one group of cells per (model, card count) combination.
"""
import os
import time
import json

AVG_ONLY = False
TRAIN_TYPE = "inference"

RESULTS_PATH = r"C:\Users\wengq\Desktop\QY项目\910b_results\final"
if TRAIN_TYPE == "lora_sft":
    MODEL_PREFIX = ["Qwen-7B", "Llama2-7B", "ChatGLM2-6B", "Baichuan2-7B"]
else:
    MODEL_PREFIX = ["Qwen-7B", "Llama2-7B", "Baichuan2-7B"]

# Number of cards used by the multi-card configuration (the folder name
# carries this value as "<n>_gpu").
MULTI_CARD_COUNT = 8

_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"


def _mean(values):
    """Return the arithmetic mean of *values*, or NaN when *values* is empty."""
    return sum(values) / len(values) if values else float("nan")


def _with_average(values):
    """Return the cells to emit for one configuration.

    With ``AVG_ONLY`` set only the average is emitted; otherwise the per-run
    values are followed by their average.
    """
    average = _mean(values)
    return [average] if AVG_ONLY else [*values, average]


def _step_count(step500):
    """Return the number of steps encoded in the run folder name."""
    return 500 if step500 else 1000


def _matching_run_dirs(results_path, model_prefix, single, step500):
    """Return the sorted run folder names for one model / card-count combination."""
    card_cnt = 1 if single else MULTI_CARD_COUNT
    folder_prefix = f"{TRAIN_TYPE}_{model_prefix}_{card_cnt}_gpu_{_step_count(step500)}"
    # os.listdir returns entries in arbitrary order; sort so that the column
    # order of the per-run cells is reproducible.
    return sorted(
        name for name in os.listdir(results_path) if name.startswith(folder_prefix)
    )


def get_train_result(step500, results_path=None):
    """Print and return the runtime / throughput rows for all configurations.

    Args:
        step500: Truthy to select the 500-step runs, falsy for 1000-step runs.
        results_path: Folder holding the run folders; defaults to
            ``RESULTS_PATH``.

    Returns:
        Three newline-terminated CSV rows: total runtime, seconds per step and
        tokens per second.  The last two rows are only filled when
        ``TRAIN_TYPE`` is ``"lora_sft"``.  A configuration without any
        matching run folder contributes ``nan`` as its average instead of
        raising ``ZeroDivisionError``.
    """
    if results_path is None:
        results_path = RESULTS_PATH

    is_training = TRAIN_TYPE == "lora_sft"
    file_prefix = "train" if is_training else "predict"
    step = _step_count(step500)

    all_total_sec = []
    all_step_sec = []
    all_token_per_sec = []

    for model_prefix in MODEL_PREFIX:
        # single == 0 -> multi-card runs, single == 1 -> single-card runs.
        for single in range(2):
            total_sec = []
            step_sec = []
            token_per_sec = []

            for dir_name in _matching_run_dirs(results_path, model_prefix, single, step500):
                results_file = os.path.join(
                    results_path, dir_name, file_prefix + "_results.json"
                )
                with open(results_file, "r", encoding="utf-8") as file:
                    train_results = json.load(file)

                total_sec.append(train_results[file_prefix + "_runtime"])
                if is_training:
                    step_sec.append(train_results["train_runtime"] / step)
                    token_per_sec.append(train_results["train_tokens_per_second"])

            all_total_sec.extend(_with_average(total_sec))
            if is_training:
                all_step_sec.extend(_with_average(step_sec))
                all_token_per_sec.extend(_with_average(token_per_sec))

    result = (
        ",".join(map(str, all_total_sec)) + "\n"
        + ",".join(map(str, all_step_sec)) + "\n"
        + ",".join(map(str, all_token_per_sec)) + "\n"
    )
    print(result)
    return result


def get_detail_folder_path(model_folder, model_prefix, run_no, single, step500):
    """Build the path of the detail folder of one run.

    The folder name is ``<model_prefix>_<TRAIN_TYPE>_<run_no>`` followed by
    ``_single`` and/or ``_step500`` when the respective flag is truthy; it is
    located under ``RESULTS_PATH/TRAIN_TYPE/model_folder``.
    """
    detail_folder = (
        model_prefix + "_" + TRAIN_TYPE + "_" + str(run_no)
        + ("_single" if single else "")
        + ("_step500" if step500 else "")
    )
    return os.path.join(RESULTS_PATH, TRAIN_TYPE, model_folder, detail_folder)


def get_train_start_end_time(model_folder, model_prefix, run_no, single, step500):
    """Return the ``cur_time`` of the first and last record of ``trainer_log.jsonl``.

    Returns:
        A ``(start_time, end_time)`` pair of ``time.struct_time`` values.

    Raises:
        ValueError: If the log contains no records.
    """
    trainer_log_path = os.path.join(
        get_detail_folder_path(model_folder, model_prefix, run_no, single, step500),
        "trainer_log.jsonl",
    )

    first_record = get_first_json(trainer_log_path)
    last_record = get_last_json(trainer_log_path)
    if first_record is None or last_record is None:
        raise ValueError(f"no records found in {trainer_log_path}")

    start_time = time.strptime(first_record["cur_time"], _TIME_FORMAT)
    end_time = time.strptime(last_record["cur_time"], _TIME_FORMAT)
    return start_time, end_time


def get_gpu_result(step500, results_path=None):
    """Print and return the peak-memory and average-power rows.

    For every run folder all ``npu_status*`` files are read line by line; the
    run contributes the maximum of ``get_max_mem_in_multi_gpu`` and the mean of
    ``get_sum_power_consumption`` over all samples.

    Args:
        step500: Truthy to select the 500-step runs, falsy for 1000-step runs.
        results_path: Folder holding the run folders; defaults to
            ``RESULTS_PATH``.

    Returns:
        Two CSV rows separated by a newline (memory, then power).  A run
        without samples, or a configuration without runs, is reported as
        ``nan`` instead of raising.
    """
    if results_path is None:
        results_path = RESULTS_PATH

    all_mem = []
    all_power_consumption = []

    for model_prefix in MODEL_PREFIX:
        for single in range(2):
            all_run_max_mem = []
            all_run_avg_power_consumption = []

            for dir_name in _matching_run_dirs(results_path, model_prefix, single, step500):
                run_dir = os.path.join(results_path, dir_name)
                max_mems = []
                power_consumptions = []

                for file_name in os.listdir(run_dir):
                    if not file_name.startswith("npu_status"):
                        continue
                    with open(os.path.join(run_dir, file_name), "r", encoding="utf-8") as file:
                        for line in file:
                            if not line.strip():
                                continue  # tolerate blank lines in the status log
                            gpu_results = json.loads(line)
                            max_mems.append(get_max_mem_in_multi_gpu(gpu_results))
                            power_consumptions.append(
                                get_sum_power_consumption(gpu_results, single)
                            )

                # Keep exactly one cell per run so columns stay aligned; a run
                # without samples shows up as nan rather than crashing max().
                all_run_max_mem.append(max(max_mems) if max_mems else float("nan"))
                all_run_avg_power_consumption.append(_mean(power_consumptions))

            all_mem.extend(_with_average(all_run_max_mem))
            all_power_consumption.extend(_with_average(all_run_avg_power_consumption))

    result = ",".join(map(str, all_mem)) + "\n" + ",".join(map(str, all_power_consumption))
    print(result)
    return result


def get_sum_power_consumption(gpu_results, single, card_no=0):
    """Return the power reading of one status sample.

    Args:
        gpu_results: Parsed status sample; its ``"npu_power_dissipation"``
            entry is a list of per-card dicts with a ``"power_dissipation"``
            value.
        single: Truthy to return the reading of card ``card_no`` only; falsy
            to return the sum over the first ``MULTI_CARD_COUNT`` cards.
        card_no: Index of the card used in single-card mode.
    """
    readings = gpu_results["npu_power_dissipation"]
    if single:
        return readings[card_no]["power_dissipation"]
    # The multi-card runs use MULTI_CARD_COUNT cards; every one of them has to
    # be included in the total.
    return sum(card["power_dissipation"] for card in readings[:MULTI_CARD_COUNT])


def get_max_mem_in_multi_gpu(gpu_results):
    """Return the highest ``mem_usage_percent`` of a status sample times 64.

    NOTE(review): 64 is presumably the per-card memory capacity in GB and
    ``mem_usage_percent`` a 0..1 fraction - confirm against the status logger.
    """
    # The leading 0 keeps the result at 0 for an empty device list.
    max_mem_percent = max(
        [0, *(gpu["mem_usage_percent"] for gpu in gpu_results["device_mem_usage"])]
    )
    return max_mem_percent * 64


def get_first_json(jsonl_file_path):
    """Return the first JSON record of a JSON-lines file, or None if it has none."""
    with open(jsonl_file_path, "r", encoding="utf-8") as file:
        for line in file:
            if line.strip():
                return json.loads(line)
    return None


def get_last_json(jsonl_file_path):
    """Return the last JSON record of a JSON-lines file, or None if it has none."""
    last_json = None
    with open(jsonl_file_path, "r", encoding="utf-8") as file:
        for line in file:
            if line.strip():
                last_json = json.loads(line)
    return last_json


def main():
    """Print the result rows for the 1000-step runs."""
    get_train_result(False)
    get_gpu_result(False)

    # To report the 500-step runs as well, additionally call
    # get_train_result(True) and get_gpu_result(True).


if __name__ == "__main__":
    main()