"""Summarise benchmark runs stored under ``RESULTS_PATH``.

Each run lives in a folder named
``<TRAIN_TYPE>_<model>_<card count>_gpu_<steps>...``.  The script reads the
``*_results.json`` file and the ``npu_status*`` JSON-lines files of every
matching run and prints comma-separated summary rows.
"""
import os
import time
import json

# When True only the per-configuration average is reported; otherwise every
# run is reported, followed by the average.
AVG_ONLY = False
TRAIN_TYPE = "inference"
RESULTS_PATH = r"C:\Users\wengq\Desktop\QY项目\910b_results\final"
# Number of cards used by a multi-card run (a single-card run uses 1).
MULTI_GPU_CNT = 8

if TRAIN_TYPE == "lora_sft":
    MODEL_PREFIX = ["Qwen-7B", "Llama2-7B", "ChatGLM2-6B", "Baichuan2-7B"]
else:
    MODEL_PREFIX = ["Qwen-7B", "Llama2-7B", "Baichuan2-7B"]


def _run_folder_prefix(model_prefix, single, step500):
    """Return the folder-name prefix shared by all runs of one configuration."""
    gpu_cnt = 1 if single else MULTI_GPU_CNT
    step = 500 if step500 else 1000
    return TRAIN_TYPE + "_" + model_prefix + "_" + str(gpu_cnt) + "_gpu_" + str(step)


def _matching_run_dirs(folder_prefix):
    """Return the entries of ``RESULTS_PATH`` starting with *folder_prefix*.

    The names are sorted because ``os.listdir`` returns entries in arbitrary
    order, which would make the column order of the output unpredictable.
    """
    return sorted(
        name for name in os.listdir(RESULTS_PATH) if name.startswith(folder_prefix)
    )


def _with_average(values):
    """Return *values* plus their mean, or only the mean when ``AVG_ONLY``.

    Raises:
        ZeroDivisionError: if *values* is empty, i.e. no run matched.
    """
    average = sum(values) / len(values)
    return [average] if AVG_ONLY else values + [average]


def get_train_result(step500):
    """Collect runtime figures for every model / card-count configuration.

    Args:
        step500: truthy to read the 500-step runs, falsy for 1000-step runs.

    Returns:
        Three newline-terminated CSV rows: total runtime, seconds per step and
        tokens per second.  The last two rows are empty unless ``TRAIN_TYPE``
        is ``"lora_sft"``.  The string is also printed.

    Raises:
        ZeroDivisionError: if a configuration has no matching run folder.
    """
    is_lora_sft = TRAIN_TYPE == "lora_sft"
    file_prefix = "train" if is_lora_sft else "predict"
    step = 500 if step500 else 1000

    all_total_sec = []
    all_step_sec = []
    all_token_per_sec = []
    for model_prefix in MODEL_PREFIX:
        # single == 0 -> multi-card runs, single == 1 -> single-card runs.
        for single in range(2):
            total_sec = []
            step_sec = []
            token_per_sec = []
            folder_prefix = _run_folder_prefix(model_prefix, single, step500)
            for dir_name in _matching_run_dirs(folder_prefix):
                results_file = os.path.join(
                    RESULTS_PATH, dir_name, file_prefix + "_results.json"
                )
                with open(results_file, 'r', encoding='utf-8') as file:
                    train_results = json.load(file)
                total_sec.append(train_results[file_prefix + '_runtime'])
                if is_lora_sft:
                    step_sec.append(train_results['train_runtime'] / step)
                    token_per_sec.append(train_results['train_tokens_per_second'])

            all_total_sec.extend(_with_average(total_sec))
            if is_lora_sft:
                all_step_sec.extend(_with_average(step_sec))
                all_token_per_sec.extend(_with_average(token_per_sec))

    result = (
        ",".join(map(str, all_total_sec)) + "\n"
        + ",".join(map(str, all_step_sec)) + "\n"
        + ",".join(map(str, all_token_per_sec)) + "\n"
    )
    print(result)
    return result


def get_detail_folder_path(model_folder, model_prefix, run_no, single, step500):
    """Build the path of one run's detail folder.

    The folder name is ``<model_prefix>_<TRAIN_TYPE>_<run_no>`` with optional
    ``_single`` and ``_step500`` suffixes, located under
    ``RESULTS_PATH/TRAIN_TYPE/model_folder``.
    """
    detail_folder = (
        model_prefix + "_" + TRAIN_TYPE + "_" + str(run_no)
        + ("_single" if single else "")
        + ("_step500" if step500 else "")
    )
    return os.path.join(RESULTS_PATH, TRAIN_TYPE, model_folder, detail_folder)


def get_train_start_end_time(model_folder, model_prefix, run_no, single, step500):
    """Return the first and last ``cur_time`` of a run's ``trainer_log.jsonl``.

    Returns:
        A ``(start_time, end_time)`` pair of ``time.struct_time`` values parsed
        with the format ``%Y-%m-%d %H:%M:%S``.
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    trainer_log_path = os.path.join(
        get_detail_folder_path(model_folder, model_prefix, run_no, single, step500),
        "trainer_log.jsonl",
    )
    start_time = time.strptime(get_first_json(trainer_log_path)['cur_time'], time_format)
    end_time = time.strptime(get_last_json(trainer_log_path)['cur_time'], time_format)
    return start_time, end_time


def get_gpu_result(step500):
    """Collect peak memory and mean power draw for every configuration.

    For each run, every record of every ``npu_status*`` file contributes one
    memory sample and one power sample; the run is summarised by the maximum
    memory sample and the mean power sample.

    Args:
        step500: truthy to read the 500-step runs, falsy for 1000-step runs.

    Returns:
        Two CSV rows (memory, then power) joined by a newline, without a
        trailing newline.  The string is also printed.

    Raises:
        ValueError: if a run folder holds no status records.
        ZeroDivisionError: if a configuration has no matching run folder.
    """
    all_mem = []
    all_power_consumption = []
    for model_prefix in MODEL_PREFIX:
        # single == 0 -> multi-card runs, single == 1 -> single-card runs.
        for single in range(2):
            all_run_max_mem = []
            all_run_avg_power_consumption = []
            folder_prefix = _run_folder_prefix(model_prefix, single, step500)
            for dir_name in _matching_run_dirs(folder_prefix):
                run_dir = os.path.join(RESULTS_PATH, dir_name)
                max_mems = []
                power_consumptions = []
                for file_name in os.listdir(run_dir):
                    if not file_name.startswith("npu_status"):
                        continue
                    with open(os.path.join(run_dir, file_name), 'r', encoding='utf-8') as file:
                        for line in file:
                            # A blank line is not a record; json.loads would
                            # reject it.
                            if not line.strip():
                                continue
                            gpu_results = json.loads(line)
                            max_mems.append(get_max_mem_in_multi_gpu(gpu_results))
                            power_consumptions.append(
                                get_sum_power_consumption(gpu_results, single)
                            )
                all_run_max_mem.append(max(max_mems))
                all_run_avg_power_consumption.append(
                    sum(power_consumptions) / len(power_consumptions)
                )

            all_mem.extend(_with_average(all_run_max_mem))
            all_power_consumption.extend(_with_average(all_run_avg_power_consumption))

    result = ",".join(map(str, all_mem)) + "\n" + ",".join(map(str, all_power_consumption))
    print(result)
    return result


def get_sum_power_consumption(gpu_results, single, card_no=0):
    """Return the power draw recorded in one status record.

    Args:
        gpu_results: one decoded status record; its ``"npu_power_dissipation"``
            entry is a sequence of dicts with a ``"power_dissipation"`` value.
        single: truthy to report only card *card_no*; falsy to add up the
            first ``MULTI_GPU_CNT`` cards.
        card_no: index of the card used by a single-card run.
    """
    readings = gpu_results["npu_power_dissipation"]
    if single:
        return readings[card_no]["power_dissipation"]
    # Every card of a multi-card run counts; stopping at index 6 would leave
    # the eighth card out of the total.
    return sum(entry["power_dissipation"] for entry in readings[:MULTI_GPU_CNT])


def get_max_mem_in_multi_gpu(gpu_results):
    """Return the highest ``mem_usage_percent`` of a record, scaled by 64.

    Values at or below zero are ignored, so the result is never negative and
    is 0 when ``"device_mem_usage"`` is empty.
    """
    percents = (gpu["mem_usage_percent"] for gpu in gpu_results["device_mem_usage"])
    max_mem_percent = max(0, max(percents, default=0))
    # NOTE(review): 64 is presumably the per-card memory capacity; confirm the
    # unit and whether mem_usage_percent is a 0-1 or a 0-100 value.
    return max_mem_percent * 64


def get_first_json(jsonl_file_path):
    """Return the object decoded from the first line of a JSON-lines file."""
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        return json.loads(file.readline())


def get_last_json(jsonl_file_path):
    """Return the object decoded from the last non-blank line of the file.

    Returns ``None`` when the file contains no records.
    """
    last_json = None
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines, e.g. extra newlines at the end of the log.
            if line.strip():
                last_json = json.loads(line)
    return last_json


def main():
    """Print the training and NPU summaries of the 1000-step runs."""
    get_train_result(False)
    get_gpu_result(False)
    # get_train_result(True)
    # get_gpu_result(True)


if __name__ == "__main__":
    main()