feat: add finish add log and gpu status

Merge branch 'main' of https://osredm.com/p04798526/LLaMA-Factory-Mirror
2024-08-20 14:31:29 +08:00 · 2024-08-20 14:30:32 +08:00 · 2024-08-20 13:52:46 +08:00 · 2024-08-20 10:35:46 +08:00
2 changed files with 38 additions and 0 deletions
--- a/src/gpu_status.py
+++ b/src/gpu_status.py
@@ -0,0 +1,37 @@
 import json
 import pynvml 
 import time
 def main():
    """Log memory and power usage of every NVIDIA GPU once a minute.

    Runs for up to 16 hours. Each sample is appended as one JSON line to
    ``./results/gpu_status.json`` and has the shape::

        {"cur_time": "YYYY-MM-DD HH:MM:SS", "all_gpu_status": [
            {"device": 0, "total_mem_GB": ..., "used_mem_GB": ...,
             "powerusage_W": ...}, ...]}

    NVML is always shut down on exit, including when the loop is interrupted
    (e.g. Ctrl-C) or a query raises.
    """
    import os  # local import: only needed to prepare the output directory

    # Bytes per GiB. The output keys say "GB" but values are binary gigabytes;
    # the key names are kept so existing log consumers keep working.
    UNIT = 1024 * 1024 * 1024
    poll_interval_s = 60
    max_runtime_s = 3600 * 16
    log_path = "./results/gpu_status.json"

    # Opening in append mode creates the file but not its parent directory.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    pynvml.nvmlInit()
    try:
        gpuDeviceCount = pynvml.nvmlDeviceGetCount()
        # monotonic() is unaffected by wall-clock adjustments during the run.
        start_time = time.monotonic()
        while time.monotonic() - start_time < max_runtime_s:
            all_gpu_status = []
            for i in range(gpuDeviceCount):
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                all_gpu_status.append(dict(
                    device=i,
                    total_mem_GB=memoryInfo.total / UNIT,
                    used_mem_GB=memoryInfo.used / UNIT,
                    # NVML reports power draw in milliwatts; convert to watts.
                    powerusage_W=pynvml.nvmlDeviceGetPowerUsage(handle) / 1000,
                ))
            logs = dict(
                cur_time=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                all_gpu_status=all_gpu_status,
            )
            # Re-open per sample so every record is flushed to disk immediately.
            with open(log_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(logs) + "\n")
            time.sleep(poll_interval_s)
    finally:
        # Release NVML even if sampling or writing failed part-way through.
        pynvml.nvmlShutdown()
 # Start the GPU monitoring loop only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/src/llamafactory/train/callbacks.py
+++ b/src/llamafactory/train/callbacks.py
@@ -302,6 +302,7 @@ class LogCallback(TrainerCallback):
            learning_rate=state.log_history[-1].get("learning_rate", None),
            epoch=state.log_history[-1].get("epoch", None),
            percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100,
            cur_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            elapsed_time=self.elapsed_time,
            remaining_time=self.remaining_time,
            throughput="{:.2f}".format(state.num_input_tokens_seen / (time.time() - self.start_time)),
Author	SHA1	Message	Date
wql	07b328ee23	feat: add finish add log and gpu status	2024-08-20 14:31:29 +08:00
wql	abf6ab0743	Merge branch 'main' of https://osredm.com/p04798526/LLaMA-Factory-Mirror	2024-08-20 14:30:32 +08:00
wql	39e97a5c5f	Merge branch 'main' of https://osredm.com/p04798526/LLaMA-Factory-Mirror	2024-08-20 13:52:46 +08:00
wql	0ab6f2836b	feat: add cur_time to log	2024-08-20 10:35:46 +08:00