2024-09-04 16:15:17 +08:00
|
|
|
import json
|
2024-09-05 11:28:19 +08:00
|
|
|
import sys
|
2024-09-04 16:15:17 +08:00
|
|
|
import pynvml
|
|
|
|
import time
|
|
|
|
import psutil
|
|
|
|
|
2024-09-05 11:28:19 +08:00
|
|
|
# Number of bytes in one GiB; byte counts are divided by this to fill the *_GB fields.
UNIT = 1024 ** 3
|
2024-09-04 16:15:17 +08:00
|
|
|
|
2024-09-05 11:28:19 +08:00
|
|
|
def gpu_status(output_path="./results/gpu_status", print_status=False, *,
               interval=60, duration=3600 * 24):
    """Periodically sample GPU and GPU-process status and append it to a log.

    Each sample is written as one JSON object per line to
    ``{output_path}/gpu_status.json`` with the keys ``cur_time``,
    ``all_gpu_status`` and ``all_processes_status``.

    Args:
        output_path: Directory receiving ``gpu_status.json``. It is created
            if it does not exist.
        print_status: When truthy, every sample is also printed to stdout.
        interval: Seconds to sleep between two samples.
        duration: Total number of seconds to keep sampling.
    """
    # Local import so this function does not depend on the file-level imports.
    import os

    os.makedirs(output_path, exist_ok=True)

    pynvml.nvmlInit()
    try:
        device_count = pynvml.nvmlDeviceGetCount()
        # Monotonic clock: the run length is unaffected by wall-clock changes.
        started = time.monotonic()
        first_loop = True

        while time.monotonic() - started < duration:
            all_gpu_status = []
            all_processes_status = []
            for index in range(device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(index)
                all_gpu_status.append(_gpu_snapshot(index, handle))
                # Collect processes for every GPU, not only the last one.
                all_processes_status.extend(
                    _process_snapshot(index, proc_info)
                    for proc_info in
                    pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                )

            logs = dict(
                cur_time=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                all_gpu_status=all_gpu_status,
                all_processes_status=all_processes_status,
            )

            with open(f"{output_path}/gpu_status.json", "a", encoding="utf-8") as f:
                f.write(json.dumps(logs) + "\n")

            if first_loop:
                print("Start run gpu_status.py")
                first_loop = False

            if print_status:
                print(logs)

            time.sleep(interval)
    finally:
        # Always release NVML, even when sampling or writing fails.
        pynvml.nvmlShutdown()


def _gpu_snapshot(index, handle):
    """Return memory and power readings of one GPU as a JSON-serialisable dict."""
    memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
    try:
        # The raw reading is divided by 1000 to fill the watt field.
        power_watts = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000
    except pynvml.NVMLError_NotSupported:
        # Some devices expose no power sensor; log null instead of aborting.
        power_watts = None
    return dict(
        device=index,
        total_mem_GB=memory.total / UNIT,
        used_mem_GB=memory.used / UNIT,
        powerusage_W=power_watts,
    )


def _process_snapshot(index, proc_info):
    """Describe one compute process running on GPU ``index`` as a dict."""
    try:
        proc = psutil.Process(proc_info.pid)
        # oneshot() lets psutil reuse collected data across the three queries.
        with proc.oneshot():
            user = proc.username()
            created = proc.create_time()
            name = proc.name()
    except psutil.Error:
        # The process may have exited or be inaccessible (NoSuchProcess,
        # AccessDenied, ...); keep the GPU-side data and log nulls.
        user = created = name = None

    used_bytes = proc_info.usedGpuMemory  # may be None when NVML cannot report it
    return dict(
        device=index,
        pid=proc_info.pid,
        create_time=created,
        name=name,
        user=user,
        used_mem_GB=used_bytes / UNIT if used_bytes is not None else None,
    )
|
|
|
|
|
2024-09-05 11:28:19 +08:00
|
|
|
def main():
    """Run ``gpu_status`` using optional command-line arguments.

    Usage: ``script [output_path [print_status]]``

    Arguments that are omitted fall back to the defaults of ``gpu_status``.
    ``print_status`` arrives as text, so the strings listed in
    ``false_tokens`` (compared case-insensitively) disable printing; any
    other value enables it.
    """
    false_tokens = ("", "0", "false", "no", "n", "off", "none")
    args = sys.argv[1:]

    options = {}
    if len(args) >= 1:
        options["output_path"] = args[0]
    if len(args) >= 2:
        # A raw string such as "False" is truthy, so convert it explicitly.
        options["print_status"] = args[1].strip().lower() not in false_tokens

    gpu_status(**options)
|
2024-09-04 16:15:17 +08:00
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Start the monitor only when this file is executed as a script.
    main()
|