fix #4137
commit 8bf9da659c
parent cce0fad91c
```diff
@@ -72,12 +72,8 @@ def main():
     elif command == Command.EXPORT:
         export_model()
     elif command == Command.TRAIN:
-        disable_torchrun = os.environ.get("TORCHRUN_DISABLED", "0").lower() in ["true", "1"]
-        if disable_torchrun and get_device_count() > 1:
-            logger.warning("`torchrun` cannot be disabled when device count > 1.")
-            disable_torchrun = False
-
-        if (not disable_torchrun) and (get_device_count() > 0):
+        force_torchrun = os.environ.get("FORCE_TORCHRUN", "0").lower() in ["true", "1"]
+        if force_torchrun or get_device_count() > 1:
             master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1")
             master_port = os.environ.get("MASTER_PORT", str(random.randint(20001, 29999)))
             logger.info("Initializing distributed tasks at: {}:{}".format(master_addr, master_port))
```
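This hunk replaces the `TORCHRUN_DISABLED` opt-out with a `FORCE_TORCHRUN` opt-in: `torchrun` is used whenever the variable is set or more than one device is visible, and a single-device run without the flag falls through to a plain process. Below is a minimal sketch of that decision in isolation; the stubbed `get_device_count` and the `resolve_launcher` helper are illustrative assumptions, not part of the repository.

```python
import os
import random


def get_device_count() -> int:
    """Stub for the project's device counter (assumption): visible CUDA devices, else 0."""
    try:
        import torch

        return torch.cuda.device_count()
    except ImportError:
        return 0


def resolve_launcher() -> dict:
    """Mirror the patched logic: use torchrun when forced or when multiple devices exist."""
    force_torchrun = os.environ.get("FORCE_TORCHRUN", "0").lower() in ["true", "1"]
    if force_torchrun or get_device_count() > 1:
        return {
            "launcher": "torchrun",
            # Same defaults as the patched CLI: localhost and a random rendezvous port.
            "master_addr": os.environ.get("MASTER_ADDR", "127.0.0.1"),
            "master_port": os.environ.get("MASTER_PORT", str(random.randint(20001, 29999))),
        }
    return {"launcher": "python"}


if __name__ == "__main__":
    print(resolve_launcher())
```

Running the sketch with `FORCE_TORCHRUN=1` prints a `torchrun` configuration even on a single-device machine, which is the behavior the web UI relies on in the second hunk.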
```diff
@@ -278,6 +278,9 @@ class Runner:
         args = self._parse_train_args(data) if do_train else self._parse_eval_args(data)
         env = deepcopy(os.environ)
         env["LLAMABOARD_ENABLED"] = "1"
+        if args.get("deepspeed", None) is not None:
+            env["FORCE_TORCHRUN"] = "1"
+
         self.trainer = Popen("llamafactory-cli train {}".format(save_cmd(args)), env=env, shell=True)
         yield from self.monitor()

```
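The web UI runner copies the parent environment instead of mutating `os.environ`, marks the child with `LLAMABOARD_ENABLED`, and now sets `FORCE_TORCHRUN` whenever a DeepSpeed config is selected, so the spawned `llamafactory-cli train` process takes the `torchrun` path from the first hunk even on a single device. Below is a standalone sketch of the same environment-passing pattern; the `args` dict and the `echo` command are placeholders for the real web UI state and CLI invocation.

```python
import os
from copy import deepcopy
from subprocess import Popen

# Placeholder for the training arguments collected by the web UI (assumption).
args = {"deepspeed": "ds_z3_config.json"}

# Copy the parent environment so the flags only affect the child process.
env = deepcopy(os.environ)
env["LLAMABOARD_ENABLED"] = "1"
if args.get("deepspeed", None) is not None:
    env["FORCE_TORCHRUN"] = "1"  # force the CLI onto the torchrun path

# Placeholder child process; the real runner launches "llamafactory-cli train ...".
trainer = Popen("echo LLAMABOARD_ENABLED=$LLAMABOARD_ENABLED FORCE_TORCHRUN=$FORCE_TORCHRUN", env=env, shell=True)
trainer.wait()
```

Because `env` is a copy, the parent process keeps its original environment while only the child sees the two flags.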