Compare commits

...

4 Commits

2 changed files with 38 additions and 0 deletions

37
src/gpu_status.py Normal file
View File

@@ -0,0 +1,37 @@
import json
import os
import time

import pynvml
def main(duration_s=3600 * 16, interval_s=60, output_path="./results/gpu_status.json"):
    """Periodically sample GPU memory and power usage into a JSON-lines log.

    Every ``interval_s`` seconds, one JSON object is appended to
    ``output_path``. Each object holds the local timestamp (``cur_time``) and
    a list (``all_gpu_status``) with one entry per NVML device: its index,
    total and used memory in GiB, and current power draw in watts.

    Args:
        duration_s: How long to keep sampling, in seconds (default: 16 hours).
            The sleep runs after every sample, including the last one, so the
            function may return up to ``interval_s`` seconds past this value.
        interval_s: Pause between two consecutive samples, in seconds.
        output_path: File the samples are appended to. Its parent directory
            is created if it does not exist yet.

    Raises:
        pynvml.NVMLError: If NVML cannot be initialised or a device query
            fails. NVML is still shut down in that case.
        OSError: If the output file cannot be created or written.
    """
    bytes_per_gib = 1024 * 1024 * 1024

    # Opening in append mode creates the file but not its directory, so make
    # sure the directory is there before the first sample is written.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    pynvml.nvmlInit()
    try:
        device_count = pynvml.nvmlDeviceGetCount()
        # Use the monotonic clock for the elapsed-time budget so wall-clock
        # adjustments (NTP, DST) cannot shorten or extend the run.
        started_at = time.monotonic()
        while time.monotonic() - started_at < duration_s:
            all_gpu_status = []
            for index in range(device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(index)
                memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                all_gpu_status.append(
                    {
                        "device": index,
                        "total_mem_GB": memory_info.total / bytes_per_gib,
                        "used_mem_GB": memory_info.used / bytes_per_gib,
                        # NVML reports power draw in milliwatts.
                        "powerusage_W": pynvml.nvmlDeviceGetPowerUsage(handle) / 1000,
                    }
                )

            logs = {
                "cur_time": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                "all_gpu_status": all_gpu_status,
            }
            # Reopen per sample so every record is flushed to disk even if
            # the process is killed mid-run.
            with open(output_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(logs) + "\n")

            time.sleep(interval_s)
    finally:
        # Always release NVML, including when a query or write fails.
        pynvml.nvmlShutdown()


if __name__ == "__main__":
    main()

View File

@@ -302,6 +302,7 @@ class LogCallback(TrainerCallback):
learning_rate=state.log_history[-1].get("learning_rate", None),
epoch=state.log_history[-1].get("epoch", None),
percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100,
cur_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
elapsed_time=self.elapsed_time,
remaining_time=self.remaining_time,
throughput="{:.2f}".format(state.num_input_tokens_seen / (time.time() - self.start_time)),