"""Summarise training and GPU benchmark results as comma-separated rows.

For every model in ``MODEL_FOLDER`` and for both the multi-card and the
``_single`` variant, the script reads the ``train_results.json`` and
``trainer_log.jsonl`` files of each run plus a shared GPU status log, and
prints one comma-separated row per metric.
"""
import os
import time
import json

# When True only the per-configuration average is emitted; otherwise every
# run's value is emitted followed by the average.
AVG_ONLY = True
GPU_STATUS_FILE = "gpu_status_0824.json"
RESULTS_PATH = r"C:\Users\wengq\Desktop\QY项目\repo\LLaMA-Factory-Mirror\results"
TRAIN_TYPE = "lora_sft"
# MODEL_FOLDER[i] and MODEL_PREFIX[i] describe the same model.
MODEL_FOLDER = ["Qwen-7B", "Llama2-7B", "ChatGLM2-6B", "Baichuan2-7B"]
MODEL_PREFIX = ["Qwen", "llama2", "ChatGLM2", "Baichuan2"]

# Timestamp layout of the "cur_time" field in both the trainer log and the
# GPU status log.
TIME_FORMAT = "%Y-%m-%d %H:%M:%S"
# Run numbers used in the result folder names.
RUN_NUMBERS = (1, 2, 3)
# Number of leading cards whose power draw is summed for multi-card runs.
# NOTE(review): only cards 0..6 are summed -- confirm this matches the number
# of GPUs actually used by the multi-card runs.
MULTI_GPU_CARD_COUNT = 7


def _mean(values):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    return sum(values) / len(values)


def _summarise_runs(per_run_values):
    """Return ``[average]`` or ``[*runs, average]`` depending on AVG_ONLY."""
    average = _mean(per_run_values)
    if AVG_ONLY:
        return [average]
    return [*per_run_values, average]


def _to_csv_rows(rows):
    """Join each row with commas and the rows with newlines."""
    return "\n".join(",".join(map(str, row)) for row in rows)


def get_train_result(step500):
    """Print and return the training-speed summary as three CSV rows.

    Row 1 holds ``train_runtime``, row 2 holds ``train_runtime`` divided by
    the number of steps (500 when ``step500`` is truthy, otherwise 1000) and
    row 3 holds ``train_tokens_per_second``. Columns are ordered by model,
    then multi-card before single-card. The returned string ends with a
    newline.

    Args:
        step500: Truthy to read the ``_step500`` result folders.

    Returns:
        The text that was printed.
    """
    steps = 500 if step500 else 1000
    all_total_sec = []
    all_step_sec = []
    all_token_per_sec = []

    for model_folder, model_prefix in zip(MODEL_FOLDER, MODEL_PREFIX):
        for single in (False, True):
            total_sec = []
            step_sec = []
            token_per_sec = []
            for run_no in RUN_NUMBERS:
                train_result_path = os.path.join(
                    get_detail_folder_path(
                        model_folder, model_prefix, run_no, single, step500
                    ),
                    "train_results.json",
                )
                with open(train_result_path, "r", encoding="utf-8") as file:
                    train_results = json.load(file)
                total_sec.append(train_results["train_runtime"])
                step_sec.append(train_results["train_runtime"] / steps)
                token_per_sec.append(train_results["train_tokens_per_second"])

            all_total_sec.extend(_summarise_runs(total_sec))
            all_step_sec.extend(_summarise_runs(step_sec))
            all_token_per_sec.extend(_summarise_runs(token_per_sec))

    result = _to_csv_rows([all_total_sec, all_step_sec, all_token_per_sec]) + "\n"
    print(result)
    return result


def get_detail_folder_path(model_folder, model_prefix, run_no, single, step500):
    """Return the result folder of one run.

    The folder name is ``<prefix>_<TRAIN_TYPE>_<run_no>`` with ``_single``
    and/or ``_step500`` appended when the matching flag is truthy, located
    under ``RESULTS_PATH/TRAIN_TYPE/<model_folder>``.
    """
    detail_folder = (
        model_prefix
        + "_"
        + TRAIN_TYPE
        + "_"
        + str(run_no)
        + ("_single" if single else "")
        + ("_step500" if step500 else "")
    )
    return os.path.join(RESULTS_PATH, TRAIN_TYPE, model_folder, detail_folder)


def get_train_start_end_time(model_folder, model_prefix, run_no, single, step500):
    """Return ``(start, end)`` of a run as ``time.struct_time`` values.

    Both are parsed from the ``cur_time`` field of the first and the last
    record of the run's ``trainer_log.jsonl``.
    """
    trainer_log_path = os.path.join(
        get_detail_folder_path(model_folder, model_prefix, run_no, single, step500),
        "trainer_log.jsonl",
    )
    start_time = time.strptime(get_first_json(trainer_log_path)["cur_time"], TIME_FORMAT)
    end_time = time.strptime(get_last_json(trainer_log_path)["cur_time"], TIME_FORMAT)
    return start_time, end_time


def _collect_gpu_samples(gpu_status_path, start_time, end_time, single):
    """Collect per-sample max memory and power within ``[start, end]``.

    The status log is scanned from the top and the scan stops at the first
    record later than ``end_time``, so records are expected to be in
    chronological order.

    Returns:
        A ``(max_mems, power_consumptions)`` pair of equally long lists.
    """
    max_mems = []
    power_consumptions = []
    with open(gpu_status_path, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            gpu_results = json.loads(line)
            cur_time = time.strptime(gpu_results["cur_time"], TIME_FORMAT)
            if cur_time > end_time:
                break
            if cur_time >= start_time:
                max_mems.append(get_max_mem_in_multi_gpu(gpu_results))
                power_consumptions.append(
                    get_sum_power_consumption(gpu_results, single)
                )
    return max_mems, power_consumptions


def get_gpu_result(step500):
    """Print and return the GPU summary as two CSV rows.

    Row 1 holds the peak ``used_mem_GB`` seen during each run and row 2 the
    mean power draw during each run, both reduced over runs as configured by
    ``AVG_ONLY``. Columns are ordered by model, then multi-card before
    single-card. The returned string has no trailing newline.

    Args:
        step500: Truthy to use the ``_step500`` result folders.

    Returns:
        The text that was printed.

    Raises:
        ValueError: If the GPU status log has no sample inside a run's
            training window.
    """
    gpu_result_path = os.path.join(RESULTS_PATH, "gpu_status", GPU_STATUS_FILE)
    all_mem = []
    all_power_consumption = []

    for model_folder, model_prefix in zip(MODEL_FOLDER, MODEL_PREFIX):
        for single in (False, True):
            all_run_max_mem = []
            all_run_avg_power_consumption = []
            for run_no in RUN_NUMBERS:
                start_time, end_time = get_train_start_end_time(
                    model_folder, model_prefix, run_no, single, step500
                )
                max_mems, power_consumptions = _collect_gpu_samples(
                    gpu_result_path, start_time, end_time, single
                )
                if not max_mems:
                    # Same exception type max() would raise on an empty list,
                    # but with enough context to find the offending run.
                    raise ValueError(
                        "no GPU status sample between the start and end of run "
                        f"{run_no} of {model_folder} "
                        f"(single={bool(single)}, step500={bool(step500)}) "
                        f"in {gpu_result_path}"
                    )
                all_run_max_mem.append(max(max_mems))
                all_run_avg_power_consumption.append(_mean(power_consumptions))

            all_mem.extend(_summarise_runs(all_run_max_mem))
            all_power_consumption.extend(
                _summarise_runs(all_run_avg_power_consumption)
            )

    result = _to_csv_rows([all_mem, all_power_consumption])
    print(result)
    return result


def get_sum_power_consumption(gpu_results, single, card_no=0):
    """Return the power draw of one GPU status record.

    Args:
        gpu_results: A status record whose ``all_gpu_status`` entry is a
            sequence of per-card dicts with a ``powerusage_W`` key.
        single: Truthy to report only card ``card_no``; otherwise the first
            ``MULTI_GPU_CARD_COUNT`` cards are summed.
        card_no: Index of the card used when ``single`` is truthy.
    """
    cards = gpu_results["all_gpu_status"]
    if single:
        return cards[card_no]["powerusage_W"]
    return sum(cards[idx]["powerusage_W"] for idx in range(MULTI_GPU_CARD_COUNT))


def get_max_mem_in_multi_gpu(gpu_results):
    """Return the largest ``used_mem_GB`` over all cards, or 0 if none."""
    used = [gpu["used_mem_GB"] for gpu in gpu_results["all_gpu_status"]]
    # Seeding with 0 keeps the result at 0 for an empty card list.
    return max([0, *used])


def get_first_json(jsonl_file_path):
    """Return the first record of a JSON Lines file, skipping blank lines.

    Raises:
        json.JSONDecodeError: If the file holds no record or the first
            record is not valid JSON.
    """
    with open(jsonl_file_path, "r", encoding="utf-8") as file:
        for line in file:
            if line.strip():
                return json.loads(line)
    # An empty file already raised JSONDecodeError; keep that type.
    raise json.JSONDecodeError("no JSON record found", "", 0)


def get_last_json(jsonl_file_path):
    """Return the last record of a JSON Lines file, skipping blank lines.

    Returns ``None`` when the file holds no record.
    """
    last_json = None
    with open(jsonl_file_path, "r", encoding="utf-8") as file:
        for line in file:
            if line.strip():
                last_json = json.loads(line)
    return last_json


def main():
    """Print the training and GPU summaries of the 500-step runs."""
    # Pass False instead to summarise the folders without the "_step500"
    # suffix (per-step time is then computed over 1000 steps).
    get_train_result(True)
    get_gpu_result(True)


if __name__ == "__main__":
    main()