LLaMA-Factory-310P3/gpu_status.py

70 lines
2.2 KiB
Python
Raw Normal View History

2024-09-04 16:15:17 +08:00
import json
2024-09-05 11:28:19 +08:00
import sys
2024-09-04 16:15:17 +08:00
import pynvml
import time
import psutil
2024-09-05 11:28:19 +08:00
UNIT = 1024 * 1024 * 1024
2024-09-04 16:15:17 +08:00
2024-09-05 13:37:17 +08:00
def gpu_status(output_path="./results/gpu_status", print_status=False, sleep_time=60):
    """Periodically sample GPU and GPU-process status and append it as JSON lines.

    Every ``sleep_time`` seconds, for up to 24 hours, one JSON object is
    appended to ``{output_path}/gpu_status.json`` with the keys:

    * ``cur_time`` -- local wall-clock time of the sample.
    * ``all_gpu_status`` -- one entry per device: index, total/used memory
      (bytes divided by ``UNIT``) and power usage (NVML reading / 1000).
    * ``all_processes_status`` -- one entry per compute process on *every*
      device: device index, pid, creation time, name, owner and used GPU
      memory (``None`` when NVML does not report it).

    Args:
        output_path: Directory receiving ``gpu_status.json``; created if missing.
        print_status: When truthy, also print each sample to stdout.
        sleep_time: Seconds between samples; a number or a numeric string.

    Raises:
        ValueError: If ``sleep_time`` cannot be converted to a float.
    """
    import os  # local import: only needed to create the output directory

    max_duration = 3600 * 24  # stop sampling after 24 hours
    # Convert up front so a bad value fails immediately, not after the first sample.
    sleep_time = float(sleep_time)
    os.makedirs(output_path, exist_ok=True)
    log_file = f"{output_path}/gpu_status.json"

    pynvml.nvmlInit()
    try:
        gpuDeviceCount = pynvml.nvmlDeviceGetCount()
        start_time = time.time()
        first_loop = True
        while time.time() - start_time < max_duration:
            all_gpu_status = []
            all_processes_status = []
            for i in range(gpuDeviceCount):
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
                all_gpu_status.append(dict(
                    device=i,
                    total_mem_GB=memoryInfo.total / UNIT,
                    used_mem_GB=memoryInfo.used / UNIT,
                    powerusage_W=pynvml.nvmlDeviceGetPowerUsage(handle) / 1000,
                ))

                # Collect processes per device (inside the loop) so that
                # every GPU is covered, not only the last one enumerated.
                for pidInfo in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
                    try:
                        process = psutil.Process(pidInfo.pid)
                        pidUser = process.username()
                        pidCreateTime = process.create_time()
                        pidName = process.name()
                    except psutil.NoSuchProcess:
                        # The process exited between the NVML query and the
                        # psutil lookup; skip it instead of aborting the monitor.
                        continue
                    usedGpuMemory = pidInfo.usedGpuMemory
                    all_processes_status.append(dict(
                        device=i,
                        pid=pidInfo.pid,
                        create_time=pidCreateTime,
                        name=pidName,
                        user=pidUser,
                        # pynvml yields None when the driver cannot report usage.
                        used_mem_GB=(
                            usedGpuMemory / UNIT if usedGpuMemory is not None else None
                        ),
                    ))

            logs = dict(
                cur_time=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                all_gpu_status=all_gpu_status,
                all_processes_status=all_processes_status,
            )
            with open(log_file, "a", encoding="utf-8") as f:
                f.write(json.dumps(logs) + "\n")

            if first_loop:
                print("Start run gpu_status.py")
                first_loop = False
            if print_status:
                print(logs)

            time.sleep(sleep_time)
    finally:
        # Always release NVML, even if sampling or writing raised.
        pynvml.nvmlShutdown()
2024-09-05 11:28:19 +08:00
def main():
    """Command-line entry point.

    Usage: ``gpu_status.py [output_path] [print_status] [sleep_time]``

    All arguments are optional and fall back to the defaults declared by
    ``gpu_status``. ``sys.argv`` entries are strings, so they are converted
    before being passed on:

    * ``print_status`` is true only for ``1``/``true``/``yes``/``y``/``on``
      (case-insensitive), so the literal text ``False`` disables printing.
    * ``sleep_time`` is parsed as a float number of seconds.

    Raises:
        ValueError: If ``sleep_time`` is not a valid number.
    """
    args = sys.argv[1:]
    output_path = args[0] if len(args) > 0 else "./results/gpu_status"
    print_status = (
        len(args) > 1
        and args[1].strip().lower() in ("1", "true", "yes", "y", "on")
    )
    sleep_time = float(args[2]) if len(args) > 2 else 60
    gpu_status(output_path, print_status, sleep_time)
2024-09-04 16:15:17 +08:00
# Start the monitor only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()