Compare commits
4 Commits
0ae3f28774
...
07b328ee23
| Author | SHA1 | Date |
|---|---|---|
| wql | 07b328ee23 | |
| wql | abf6ab0743 | |
| wql | 39e97a5c5f | |
| wql | 0ab6f2836b | |
|
@ -0,0 +1,37 @@
|
||||||
|
import json
|
||||||
|
import pynvml
|
||||||
|
import time
|
||||||
|
|
||||||
|
def main(duration_s=3600 * 16, interval_s=60, output_path="./results/gpu_status.json"):
    """Periodically sample every NVIDIA GPU and append the readings as JSON lines.

    Each sample is one JSON object written on its own line of ``output_path``,
    holding the local timestamp (``cur_time``) and a list (``all_gpu_status``)
    with one entry per device: ``device`` index, ``total_mem_GB``,
    ``used_mem_GB`` and ``powerusage_W``.

    Args:
        duration_s: How long to keep sampling, in seconds. Defaults to 16 hours.
        interval_s: Pause between two consecutive samples, in seconds.
        output_path: File the JSON lines are appended to. Its parent directory
            is created if it does not exist yet.
    """
    # Local import so this function is self-contained regardless of which
    # modules the surrounding file imports at the top.
    import os

    # Bytes per binary gigabyte: the "*_GB" fields are therefore 1024**3-based.
    UNIT = 1024 * 1024 * 1024

    # Appending to a file in a missing directory raises FileNotFoundError,
    # so make sure the target directory exists before the first write.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    pynvml.nvmlInit()
    try:
        gpu_device_count = pynvml.nvmlDeviceGetCount()

        # A monotonic clock keeps the time budget correct even if the
        # wall clock is adjusted (NTP, manual change) during the long run.
        start_time = time.monotonic()

        while time.monotonic() - start_time < duration_s:
            all_gpu_status = []
            for i in range(gpu_device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                all_gpu_status.append(dict(
                    device=i,
                    total_mem_GB=memory_info.total / UNIT,
                    used_mem_GB=memory_info.used / UNIT,
                    # Divided by 1000 to report watts, as the key name says.
                    powerusage_W=pynvml.nvmlDeviceGetPowerUsage(handle) / 1000,
                ))

            logs = dict(
                cur_time=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                all_gpu_status=all_gpu_status,
            )

            # Re-open in append mode for every sample so each record reaches
            # the file promptly and earlier records survive an abrupt stop.
            with open(output_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(logs) + "\n")

            time.sleep(interval_s)
    finally:
        # Always release NVML, including on errors or KeyboardInterrupt.
        pynvml.nvmlShutdown()


if __name__ == "__main__":
    main()
|
|
@ -302,6 +302,7 @@ class LogCallback(TrainerCallback):
|
||||||
learning_rate=state.log_history[-1].get("learning_rate", None),
|
learning_rate=state.log_history[-1].get("learning_rate", None),
|
||||||
epoch=state.log_history[-1].get("epoch", None),
|
epoch=state.log_history[-1].get("epoch", None),
|
||||||
percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100,
|
percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100,
|
||||||
|
cur_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
|
||||||
elapsed_time=self.elapsed_time,
|
elapsed_time=self.elapsed_time,
|
||||||
remaining_time=self.remaining_time,
|
remaining_time=self.remaining_time,
|
||||||
throughput="{:.2f}".format(state.num_input_tokens_seen / (time.time() - self.start_time)),
|
throughput="{:.2f}".format(state.num_input_tokens_seen / (time.time() - self.start_time)),
|
||||||
|
|
Loading…
Reference in New Issue