# LLaMA-Factory-Mirror/sort_result.py
#
# Aggregates LLaMA-Factory benchmark outputs (training runtime, per-step time,
# tokens/sec, peak GPU memory and average power draw) into comma-separated
# summary lines for pasting into a spreadsheet.
import os
import time
import json

# When True, emit only the 3-run average per configuration; otherwise emit
# all three raw run values followed by their average.
AVG_ONLY = True
# JSONL file of periodic GPU status snapshots (one JSON object per line).
GPU_STATUS_FILE = "gpu_status_0824.json"
# Root directory holding all benchmark outputs.
RESULTS_PATH = r"C:\Users\wengq\Desktop\QY项目\repo\LLaMA-Factory-Mirror\results"
# Training recipe name; used both as a results sub-folder and in run-folder names.
TRAIN_TYPE = "lora_sft"
# Results folder names and the matching run-folder name prefixes,
# index-aligned (MODEL_FOLDER[i] pairs with MODEL_PREFIX[i]).
MODEL_FOLDER = ["Qwen-7B", "Llama2-7B", "ChatGLM2-6B", "Baichuan2-7B"]
MODEL_PREFIX = ["Qwen", "llama2", "ChatGLM2", "Baichuan2"]
def get_train_result(step500):
    """Collect training-time metrics for every model/configuration combination.

    For each of the 4 models, in both multi-GPU and single-GPU variants, reads
    the three repeated runs' ``train_results.json`` and gathers:
      * total wall-clock seconds (``train_runtime``)
      * seconds per optimization step
      * tokens processed per second (``train_tokens_per_second``)

    With AVG_ONLY set, only the average over the runs is kept per
    configuration; otherwise the raw values are followed by their average.

    Args:
        step500: True when the runs were capped at 500 steps, False for 1000.
            Selects both the results folder suffix and the per-step divisor.

    Returns:
        Three comma-joined lines (total seconds, per-step seconds,
        tokens/sec), newline-terminated; also printed to stdout.
    """
    # Hoist the step count — it is the same for every run in this call.
    steps = 500 if step500 else 1000
    all_total_sec = []
    all_step_sec = []
    all_token_per_sec = []
    for model_idx in range(4):
        for single in range(2):
            total_sec = []
            step_sec = []
            token_per_sec = []
            for run_no in range(1, 4):
                train_result_path = os.path.join(
                    get_detail_folder_path(
                        MODEL_FOLDER[model_idx], MODEL_PREFIX[model_idx],
                        run_no, single, step500),
                    "train_results.json")
                with open(train_result_path, 'r', encoding='utf-8') as file:
                    train_results = json.load(file)
                total_sec.append(train_results['train_runtime'])
                step_sec.append(train_results['train_runtime'] / steps)
                token_per_sec.append(train_results['train_tokens_per_second'])
            # Average over the actual run count instead of a hard-coded 3, so
            # changing range(1, 4) above cannot silently skew the averages.
            if AVG_ONLY:
                total_sec = [sum(total_sec) / len(total_sec)]
                step_sec = [sum(step_sec) / len(step_sec)]
                token_per_sec = [sum(token_per_sec) / len(token_per_sec)]
            else:
                total_sec.append(sum(total_sec) / len(total_sec))
                step_sec.append(sum(step_sec) / len(step_sec))
                token_per_sec.append(sum(token_per_sec) / len(token_per_sec))
            all_mem_row = None  # no-op placeholder removed; keep lists flat
            all_total_sec.extend(total_sec)
            all_step_sec.extend(step_sec)
            all_token_per_sec.extend(token_per_sec)
    result = "\n".join([
        ",".join(map(str, all_total_sec)),
        ",".join(map(str, all_step_sec)),
        ",".join(map(str, all_token_per_sec)),
    ]) + "\n"
    print(result)
    return result
def get_detail_folder_path(model_folder, model_prefix, run_no, single, step500):
    """Build the absolute path of one run's output folder under RESULTS_PATH.

    Folder layout: RESULTS_PATH/TRAIN_TYPE/<model_folder>/
    <model_prefix>_<TRAIN_TYPE>_<run_no>[_single][_step500]
    """
    suffix = ("_single" if single else "") + ("_step500" if step500 else "")
    run_folder = f"{model_prefix}_{TRAIN_TYPE}_{run_no}{suffix}"
    return os.path.join(RESULTS_PATH, TRAIN_TYPE, model_folder, run_folder)
def get_train_start_end_time(model_folder, model_prefix, run_no, single, step500):
    """Return (start, end) struct_times of one run, read from trainer_log.jsonl.

    The first and last log lines carry the run's boundary timestamps in their
    "cur_time" field.
    """
    log_path = os.path.join(
        get_detail_folder_path(model_folder, model_prefix, run_no, single, step500),
        "trainer_log.jsonl")
    fmt = "%Y-%m-%d %H:%M:%S"
    start = time.strptime(get_first_json(log_path)['cur_time'], fmt)
    end = time.strptime(get_last_json(log_path)['cur_time'], fmt)
    return start, end
def get_gpu_result(step500):
    """Collect GPU memory/power metrics for every model/configuration combination.

    For each model, single/multi-GPU variant and each of the 3 runs, scans the
    GPU-status JSONL for samples whose timestamp falls within the run's
    [start, end] window (taken from trainer_log.jsonl) and records:
      * the peak per-card memory use (GB) observed during the run
      * the average power draw (W) over the run's samples

    Args:
        step500: True for the 500-step runs, False for the 1000-step runs
            (selects which result folders are read).

    Returns:
        Two comma-joined lines (memory line, newline, power line); also
        printed to stdout.
    """
    # The status file is identical for every run — resolve the path once
    # instead of rebuilding it inside the innermost loop.
    gpu_result_path = os.path.join(RESULTS_PATH, "gpu_status", GPU_STATUS_FILE)
    all_mem = []
    all_power_consumption = []
    for model_idx in range(4):
        for single in range(2):
            all_run_max_mem = []
            all_run_avg_power_consumption = []
            for run_no in range(1, 4):
                start_time, end_time = get_train_start_end_time(
                    MODEL_FOLDER[model_idx], MODEL_PREFIX[model_idx],
                    run_no, single, step500)
                max_mems = []
                power_consumptions = []
                with open(gpu_result_path, 'r', encoding='utf-8') as file:
                    for line in file:
                        gpu_results = json.loads(line)
                        cur_time = time.strptime(gpu_results["cur_time"],
                                                 "%Y-%m-%d %H:%M:%S")
                        if start_time <= cur_time <= end_time:
                            max_mems.append(get_max_mem_in_multi_gpu(gpu_results))
                            power_consumptions.append(
                                get_sum_power_consumption(gpu_results, single))
                        elif cur_time > end_time:
                            # Snapshots are appended chronologically, so no
                            # later line can belong to this run — stop early.
                            break
                all_run_max_mem.append(max(max_mems))
                all_run_avg_power_consumption.append(
                    sum(power_consumptions) / len(power_consumptions))
            # Average over the actual run count rather than a hard-coded 3.
            if AVG_ONLY:
                all_run_max_mem = [sum(all_run_max_mem) / len(all_run_max_mem)]
                all_run_avg_power_consumption = [
                    sum(all_run_avg_power_consumption)
                    / len(all_run_avg_power_consumption)]
            else:
                all_run_max_mem.append(
                    sum(all_run_max_mem) / len(all_run_max_mem))
                all_run_avg_power_consumption.append(
                    sum(all_run_avg_power_consumption)
                    / len(all_run_avg_power_consumption))
            all_mem.extend(all_run_max_mem)
            all_power_consumption.extend(all_run_avg_power_consumption)
    result = (",".join(map(str, all_mem)) + "\n"
              + ",".join(map(str, all_power_consumption)))
    print(result)
    return result
def get_sum_power_consumption(gpu_results, single, card_no = 0):
    """Return the power draw in watts from one GPU-status snapshot.

    Args:
        gpu_results: One decoded line of the GPU-status JSONL; must contain an
            "all_gpu_status" list of per-card dicts with a "powerusage_W" key.
        single: Truthy for a single-GPU run — report only card ``card_no``.
            Falsy for a multi-GPU run — report the sum over cards 0..6.
        card_no: Card index used for single-GPU runs (default 0).

    Returns:
        Power draw in watts (single card or summed).
    """
    if single:
        return gpu_results["all_gpu_status"][card_no]["powerusage_W"]
    # Builtin sum() replaces the manual accumulator, which shadowed `sum`.
    # NOTE(review): only cards 0..6 are summed — presumably multi-GPU training
    # uses 7 cards here; confirm against the launch configuration.
    return sum(gpu_results["all_gpu_status"][idx]["powerusage_W"]
               for idx in range(7))
def get_max_mem_in_multi_gpu(gpu_results):
    """Return the largest "used_mem_GB" across all cards in one snapshot.

    Returns 0 when the snapshot lists no cards (same as the accumulator's
    starting value).
    """
    # Seeding with 0 preserves the original floor of zero exactly.
    return max([0] + [gpu["used_mem_GB"]
                      for gpu in gpu_results["all_gpu_status"]])
def get_first_json(jsonl_file_path):
    """Parse and return the first line of a JSONL file as a dict."""
    with open(jsonl_file_path, 'r', encoding='utf-8') as fh:
        return json.loads(fh.readline())
def get_last_json(jsonl_file_path):
    """Parse and return the last line of a JSONL file as a dict.

    Returns None when the file is empty.
    """
    parsed = None
    with open(jsonl_file_path, 'r', encoding='utf-8') as fh:
        for raw_line in fh:
            parsed = json.loads(raw_line)
    return parsed
def main():
    """Print training-time and GPU metrics for the 500-step benchmark runs."""
    # The 1000-step variants can be re-enabled once those results exist:
    # get_train_result(False)
    # get_gpu_result(False)
    get_train_result(True)
    get_gpu_result(True)


if __name__ == "__main__":
    main()