add DISABLE_TORCHRUN option

This commit is contained in:
hiyouga 2024-06-06 23:44:58 +08:00
parent 55c18c49b0
commit 45d8be8f93
1 changed file with 6 additions and 1 deletion

View File

@@ -72,7 +72,12 @@ def main():
elif command == Command.EXPORT: elif command == Command.EXPORT:
export_model() export_model()
elif command == Command.TRAIN: elif command == Command.TRAIN:
if get_device_count() > 0: disable_torchrun = os.environ.get("DISABLE_TORCHRUN", "0").lower() in ["true", "1"]
if disable_torchrun and get_device_count() > 1:
logger.warning("`torchrun` cannot be disabled when device count > 1.")
disable_torchrun = False
if (not disable_torchrun) and (get_device_count() > 0):
master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1") master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1")
master_port = os.environ.get("MASTER_PORT", str(random.randint(20001, 29999))) master_port = os.environ.get("MASTER_PORT", str(random.randint(20001, 29999)))
logger.info("Initializing distributed tasks at: {}:{}".format(master_addr, master_port)) logger.info("Initializing distributed tasks at: {}:{}".format(master_addr, master_port))