# gpu_status.py -- reconstructed from patch 91db09b6 ("feat: add gpu_status.py").
"""Periodically snapshot GPU and GPU-process status to JSON files.

Every sampling interval the script queries NVML (through ``pynvml``) for the
memory and power usage of each GPU and for the compute processes running on
each GPU, enriches the process entries with ``psutil`` metadata, prints the
snapshot, and appends it as one JSON line to a timestamped file in the output
directory.
"""

import json
import os
import time

import psutil
import pynvml

# Divisor used to convert the byte counts returned by NVML into GB figures.
BYTES_PER_GB = 1024 * 1024 * 1024


def _gpu_status(index, handle):
    """Return the memory and power readings of one device as a JSON-ready dict."""
    memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
    try:
        # The raw reading is divided by 1000 to obtain watts.
        power_w = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000
    except pynvml.NVMLError:
        # Some boards do not expose power telemetry; keep the rest of the
        # snapshot instead of aborting the whole monitoring run.
        power_w = None
    return dict(
        device=index,
        total_mem_GB=memory.total / BYTES_PER_GB,
        used_mem_GB=memory.used / BYTES_PER_GB,
        powerusage_W=power_w,
    )


def _process_status(index, handle):
    """Return one dict per compute process currently running on the device."""
    statuses = []
    for proc_info in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
        user = create_time = name = None
        try:
            proc = psutil.Process(proc_info.pid)
            # oneshot() lets psutil reuse cached process info across the calls.
            with proc.oneshot():
                user = proc.username()
                create_time = proc.create_time()
                name = proc.name()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process may have exited since NVML listed it, or may not be
            # visible/readable from here. Still record its GPU memory usage;
            # the metadata fields that could not be read stay None.
            pass

        used_bytes = proc_info.usedGpuMemory
        statuses.append(dict(
            device=index,
            pid=proc_info.pid,
            create_time=create_time,
            name=name,
            user=user,
            # NVML may not be able to report per-process memory (None).
            used_mem_GB=None if used_bytes is None else used_bytes / BYTES_PER_GB,
        ))
    return statuses


def main(duration_s=3600 * 24, interval_s=60, output_dir="./results"):
    """Log GPU status snapshots until ``duration_s`` seconds have elapsed.

    Args:
        duration_s: Total monitoring time in seconds (default: 24 hours).
        interval_s: Pause between two snapshots in seconds (default: 60).
        output_dir: Directory receiving the ``gpu_status_<timestamp>.json``
            files; it is created when missing.

    Each snapshot is written as a single JSON line with the keys
    ``cur_time``, ``all_gpu_status`` and ``all_processes_status`` and is
    also printed to stdout.
    """
    os.makedirs(output_dir, exist_ok=True)

    pynvml.nvmlInit()
    try:
        device_count = pynvml.nvmlDeviceGetCount()
        # A monotonic clock keeps the run length immune to wall-clock changes.
        started = time.monotonic()

        while time.monotonic() - started < duration_s:
            all_gpu_status = []
            all_processes_status = []
            for index in range(device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(index)
                all_gpu_status.append(_gpu_status(index, handle))
                # Query processes per device so every GPU is covered, not
                # only the last one enumerated.
                all_processes_status.extend(_process_status(index, handle))

            # Sample the clock once so the record and its file name agree.
            now = time.localtime()
            logs = dict(
                cur_time=time.strftime("%Y-%m-%d %H:%M:%S", now),
                all_gpu_status=all_gpu_status,
                all_processes_status=all_processes_status,
            )
            formatted_time = time.strftime("%Y%m%d%H%M%S", now)
            path = os.path.join(output_dir, f"gpu_status_{formatted_time}.json")
            with open(path, "a", encoding="utf-8") as f:
                f.write(json.dumps(logs) + "\n")
            print(logs)

            time.sleep(interval_s)
    finally:
        # Release NVML even when the loop is interrupted or a query fails.
        pynvml.nvmlShutdown()


if __name__ == "__main__":
    main()