Merge pull request #4066 from injet-zhou/main
add throughput entry to training log
This commit is contained in:
commit
f2580ad403
|
@@ -170,12 +170,14 @@ class LogCallback(TrainerCallback):
|
||||||
percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100,
|
percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100,
|
||||||
elapsed_time=self.elapsed_time,
|
elapsed_time=self.elapsed_time,
|
||||||
remaining_time=self.remaining_time,
|
remaining_time=self.remaining_time,
|
||||||
|
throughput="{:.2f}".format(state.num_input_tokens_seen / (time.time() - self.start_time)),
|
||||||
|
total_tokens=state.num_input_tokens_seen,
|
||||||
)
|
)
|
||||||
logs = {k: v for k, v in logs.items() if v is not None}
|
logs = {k: v for k, v in logs.items() if v is not None}
|
||||||
if self.webui_mode and all(key in logs for key in ["loss", "learning_rate", "epoch"]):
|
if self.webui_mode and all(key in logs for key in ["loss", "learning_rate", "epoch"]):
|
||||||
logger.info(
|
logger.info(
|
||||||
"{{'loss': {:.4f}, 'learning_rate': {:2.4e}, 'epoch': {:.2f}}}".format(
|
"{{'loss': {:.4f}, 'learning_rate': {:2.4e}, 'epoch': {:.2f}, 'throughput': {}}}".format(
|
||||||
logs["loss"], logs["learning_rate"], logs["epoch"]
|
logs["loss"], logs["learning_rate"], logs["epoch"], logs["throughput"]
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@@ -132,6 +132,7 @@ class Runner:
|
||||||
pure_bf16=(get("train.compute_type") == "pure_bf16"),
|
pure_bf16=(get("train.compute_type") == "pure_bf16"),
|
||||||
plot_loss=True,
|
plot_loss=True,
|
||||||
ddp_timeout=180000000,
|
ddp_timeout=180000000,
|
||||||
|
include_num_input_tokens_seen=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# checkpoints
|
# checkpoints
|
||||||
|
|
Loading…
Reference in New Issue