LLaMA-Factory-310P3/gpu_status.py

import json
import pynvml 
import time
import psutil


def _collect_device_status(index, handle, unit):
    """Return memory and power readings for one GPU as a JSON-ready dict.

    Args:
        index: NVML device index, stored under the ``device`` key.
        handle: NVML device handle for that index.
        unit: Number of bytes per reported memory unit (GiB in ``main``).
    """
    memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
    try:
        # NVML reports power draw in milliwatts; the log stores watts.
        power_w = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000
    except pynvml.NVMLError:
        # Some devices do not expose a power reading; keep sampling the rest.
        power_w = None
    return dict(
        device=index,
        total_mem_GB=memory.total / unit,
        used_mem_GB=memory.used / unit,
        powerusage_W=power_w,
    )


def _collect_process_status(index, handle, unit):
    """Return one JSON-ready dict per compute process running on one GPU.

    Args:
        index: NVML device index, stored under the ``device`` key so that
            entries from different GPUs can be told apart.
        handle: NVML device handle for that index.
        unit: Number of bytes per reported memory unit (GiB in ``main``).
    """
    entries = []
    for info in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
        try:
            proc = psutil.Process(info.pid)
            # oneshot() lets psutil answer the three queries from one
            # cached read of the process state.
            with proc.oneshot():
                user = proc.username()
                created = proc.create_time()
                name = proc.name()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process may exit between the NVML listing and the psutil
            # lookup, or be hidden from us; still record its GPU memory.
            user = created = name = None
        used_bytes = info.usedGpuMemory  # None when NVML cannot report it
        entries.append(dict(
            device=index,
            pid=info.pid,
            create_time=created,
            name=name,
            user=user,
            used_mem_GB=None if used_bytes is None else used_bytes / unit,
        ))
    return entries


def main(duration_s=3600 * 24, interval_s=60, output_dir="./results/gpu_status"):
    """Periodically sample GPU and GPU-process status and log it as JSON.

    Every ``interval_s`` seconds, for ``duration_s`` seconds in total, one
    snapshot covering all NVML devices and the compute processes running on
    each of them is written to ``<output_dir>/gpu_status_<timestamp>.json``
    (one JSON object per line) and echoed to stdout.

    Args:
        duration_s: Total monitoring time in seconds (default: 24 hours).
        interval_s: Pause between two snapshots in seconds (default: 60).
        output_dir: Directory receiving the snapshot files; created if
            it does not exist.
    """
    import os  # local import: only needed here, keeps the file's import block untouched

    UNIT = 1024 * 1024 * 1024  # bytes per GiB

    os.makedirs(output_dir, exist_ok=True)

    pynvml.nvmlInit()
    try:
        device_count = pynvml.nvmlDeviceGetCount()
        # Monotonic clock: the run length is unaffected by wall-clock changes.
        started = time.monotonic()

        while time.monotonic() - started < duration_s:
            all_gpu_status = []
            all_processes_status = []
            for index in range(device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(index)
                all_gpu_status.append(_collect_device_status(index, handle, UNIT))
                # Query processes per device; doing it once after the loop
                # would only cover the last GPU (and fail with zero GPUs).
                all_processes_status.extend(
                    _collect_process_status(index, handle, UNIT)
                )

            # One timestamp for both the record and the file name so the
            # two can never disagree.
            now = time.localtime()
            logs = dict(
                cur_time=time.strftime("%Y-%m-%d %H:%M:%S", now),
                all_gpu_status=all_gpu_status,
                all_processes_status=all_processes_status,
            )
            formatted_time = time.strftime('%Y%m%d%H%M%S', now)
            log_path = os.path.join(output_dir, f"gpu_status_{formatted_time}.json")
            with open(log_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(logs) + "\n")
            print(logs)

            time.sleep(interval_s)
    finally:
        # Release NVML even if sampling or file I/O raised.
        pynvml.nvmlShutdown()


# Script entry point: start the monitoring loop only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
feat: add gpu_status.py 2024-09-04 16:15:17 +08:00			`import json`
			`import pynvml`
			`import time`
			`import psutil`


			`def main():`
			`UNIT = 1024 * 1024 * 1024`

			`pynvml.nvmlInit()`
			`gpuDeviceCount = pynvml.nvmlDeviceGetCount()`
			`start_time = time.time()`

			`while time.time() - start_time < 3600 *24:`
			`# print(time.time() - start_time)`
			`all_gpu_status = []`
			`for i in range(gpuDeviceCount):`
			`handle = pynvml.nvmlDeviceGetHandleByIndex(i)`
			`memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)`
			`all_gpu_status.append(dict(`
			`device = i,`
			`total_mem_GB = memoryInfo.total/UNIT,`
			`used_mem_GB = memoryInfo.used/UNIT,`
			`powerusage_W = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000`
			`))`

			`all_processes_status = []`
			`pidAllInfo = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)`
			`for pidInfo in pidAllInfo:`
			`pidUser = psutil.Process(pidInfo.pid).username()`
			`pidCreateTime = psutil.Process(pidInfo.pid).create_time()`
			`pidName = psutil.Process(pidInfo.pid).name()`
			`all_processes_status.append(dict(`
			`pid = pidInfo.pid,`
			`create_time = pidCreateTime,`
			`name = pidName,`
			`user = pidUser,`
			`used_mem_GB = pidInfo.usedGpuMemory/UNIT`
			`))`

			`logs = dict(`
			`cur_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),`
			`all_gpu_status = all_gpu_status,`
			`all_processes_status = all_processes_status`
			`)`
			`formatted_time = time.strftime('%Y%m%d%H%M%S', time.localtime())`
chore: change gpu status save folder 2024-09-04 16:24:15 +08:00			`with open(f"./results/gpu_status/gpu_status_{formatted_time}.json", "a", encoding="utf-8") as f:`
feat: add gpu_status.py 2024-09-04 16:15:17 +08:00			`f.write(json.dumps(logs) + "\n")`
			`print(logs)`

			`time.sleep(60)`
			`pynvml.nvmlShutdown()`


			`if __name__ == "__main__":`
			`main()`