"""Periodically log GPU and GPU-process status to a JSON-lines file."""

import json
import os
import sys
import time

import psutil
import pynvml

UNIT = 1024 * 1024 * 1024  # divisor turning NVML byte counts into the *_GB fields

# Command-line spellings that switch status printing off; anything else
# keeps the historical behaviour of enabling it.
_FALSE_STRINGS = frozenset({"", "0", "false", "no", "n", "off", "none"})


def _parse_flag(text):
    """Return False for an explicit "off" spelling of *text*, True otherwise."""
    return text.strip().lower() not in _FALSE_STRINGS


def _device_status(index, handle):
    """Return memory and power readings for the device behind *handle*."""
    memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return dict(
        device=index,
        total_mem_GB=memory_info.total / UNIT,
        used_mem_GB=memory_info.used / UNIT,
        # NVML reports power draw in milliwatts.
        powerusage_W=pynvml.nvmlDeviceGetPowerUsage(handle) / 1000,
    )


def _process_statuses(index, handle):
    """Return one record per compute process running on the given device.

    Host-side details (user, name, creation time) come from psutil and are
    recorded as ``None`` when the process has already exited or cannot be
    inspected, so a short-lived job never aborts the monitor.
    """
    statuses = []
    for pid_info in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
        user = name = create_time = None
        try:
            # A single Process object avoids three separate lookups and the
            # race between them.
            proc = psutil.Process(pid_info.pid)
            with proc.oneshot():
                user = proc.username()
                create_time = proc.create_time()
                name = proc.name()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process ended between the NVML query and the psutil
            # lookup, or belongs to another user; keep the GPU-side data.
            pass

        # NVML may report the per-process memory as unavailable (None).
        used_bytes = pid_info.usedGpuMemory
        statuses.append(dict(
            device=index,
            pid=pid_info.pid,
            create_time=create_time,
            name=name,
            user=user,
            used_mem_GB=None if used_bytes is None else used_bytes / UNIT,
        ))
    return statuses


def gpu_status(output_path="./results/gpu_status", print_status=False,
               duration_s=3600 * 24, interval_s=60):
    """Poll every GPU and append one JSON line per poll to a log file.

    Each line holds the local timestamp, per-device memory/power readings
    and the compute processes found on *all* devices (a process using
    several GPUs appears once per device, tagged with ``device``).

    Args:
        output_path: Directory that receives ``gpu_status.json``; it is
            created if it does not exist.
        print_status: When truthy, also print every record to stdout.
        duration_s: Total monitoring time in seconds (default: 24 hours).
        interval_s: Pause between two polls in seconds (default: 60).
    """
    os.makedirs(output_path, exist_ok=True)
    log_file = os.path.join(output_path, "gpu_status.json")

    pynvml.nvmlInit()
    try:
        device_count = pynvml.nvmlDeviceGetCount()
        # Monotonic clock: immune to wall-clock adjustments during the run.
        started = time.monotonic()
        first_loop = True

        while time.monotonic() - started < duration_s:
            all_gpu_status = []
            all_processes_status = []
            for index in range(device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(index)
                all_gpu_status.append(_device_status(index, handle))
                all_processes_status.extend(_process_statuses(index, handle))

            logs = dict(
                cur_time=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                all_gpu_status=all_gpu_status,
                all_processes_status=all_processes_status,
            )

            # Re-open per poll so every record is flushed to disk promptly.
            with open(log_file, "a", encoding="utf-8") as f:
                f.write(json.dumps(logs) + "\n")

            if first_loop:
                print("Start run gpu_status.py")
                first_loop = False

            if print_status:
                print(logs)

            time.sleep(interval_s)
    finally:
        # Release NVML even when the loop is interrupted or a query fails.
        pynvml.nvmlShutdown()


def main():
    """Run the monitor from the command line.

    Usage: ``gpu_status.py [output_path [print_status]]``. Omitted
    arguments fall back to the defaults of :func:`gpu_status`;
    ``print_status`` is parsed so that "False", "0", "no" or "off"
    disable printing.
    """
    args = sys.argv[1:]
    options = {}
    if len(args) >= 1:
        options["output_path"] = args[0]
    if len(args) >= 2:
        options["print_status"] = _parse_flag(args[1])
    gpu_status(**options)


if __name__ == "__main__":
    main()