feat: add finish add log and gpu status

Merge branch 'main' of https://osredm.com/p04798526/LLaMA-Factory-Mirror
2024-08-20 14:31:29 +08:00 · 2024-08-20 14:30:32 +08:00 · 2024-08-20 13:52:46 +08:00 · 2024-08-20 10:35:46 +08:00
2 changed files with 38 additions and 0 deletions
--- a/src/gpu_status.py
+++ b/src/gpu_status.py
@@ -0,0 +1,37 @@
+import json
+import pynvml 
+import time
+
+def main(duration_s=3600 * 16, interval_s=60, output_path="./results/gpu_status.json"):
+    """Periodically append a snapshot of every GPU's memory and power use to a file.
+
+    Each sample is written as one JSON object per line containing the local
+    timestamp (``cur_time``) and a list (``all_gpu_status``) with one entry
+    per device reported by NVML.
+
+    Args:
+        duration_s: Total time to keep sampling, in seconds (default 16 hours).
+        interval_s: Pause between two consecutive samples, in seconds.
+        output_path: File the samples are appended to; its parent directory
+            is created when missing.
+    """
+    import os  # local import: only needed here to create the output directory
+
+    UNIT = 1024 * 1024 * 1024  # bytes per GiB (reported under the *_GB keys)
+
+    # open(..., "a") creates the file but not its parent directory.
+    output_dir = os.path.dirname(output_path)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+    pynvml.nvmlInit()
+    try:
+        gpuDeviceCount = pynvml.nvmlDeviceGetCount()
+        # Monotonic clock: the sampling window is unaffected by wall-clock changes.
+        start_time = time.monotonic()
+
+        while time.monotonic() - start_time < duration_s:
+            all_gpu_status = []
+            for i in range(gpuDeviceCount):
+                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+                memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
+                all_gpu_status.append(dict(
+                    device=i,
+                    total_mem_GB=memoryInfo.total / UNIT,
+                    used_mem_GB=memoryInfo.used / UNIT,
+                    # NVML reports power draw in milliwatts; convert to watts.
+                    powerusage_W=pynvml.nvmlDeviceGetPowerUsage(handle) / 1000,
+                ))
+
+            logs = dict(
+                cur_time=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
+                all_gpu_status=all_gpu_status,
+            )
+            # Re-open per sample so every record is on disk before sleeping.
+            with open(output_path, "a", encoding="utf-8") as f:
+                f.write(json.dumps(logs) + "\n")
+            time.sleep(interval_s)
+    finally:
+        # Release NVML even when the loop is interrupted (e.g. Ctrl+C) or a query fails.
+        pynvml.nvmlShutdown()
+
+
+# Start sampling only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
--- a/src/llamafactory/train/callbacks.py
+++ b/src/llamafactory/train/callbacks.py
@@ -302,6 +302,7 @@ class LogCallback(TrainerCallback):
            learning_rate=state.log_history[-1].get("learning_rate", None),
            epoch=state.log_history[-1].get("epoch", None),
            percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100,
+            cur_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            elapsed_time=self.elapsed_time,
            remaining_time=self.remaining_time,
            throughput="{:.2f}".format(state.num_input_tokens_seen / (time.time() - self.start_time)),
Author	SHA1	Message	Date
wql	07b328ee23	feat: add finish add log and gpu status	2024-08-20 14:31:29 +08:00
wql	abf6ab0743	Merge branch 'main' of https://osredm.com/p04798526/LLaMA-Factory-Mirror	2024-08-20 14:30:32 +08:00
wql	39e97a5c5f	Merge branch 'main' of https://osredm.com/p04798526/LLaMA-Factory-Mirror	2024-08-20 13:52:46 +08:00
wql	0ab6f2836b	feat: add cur_time to log	2024-08-20 10:35:46 +08:00