From 8c13a9e4659a64a76324ebbddd529ad30e6fca34 Mon Sep 17 00:00:00 2001 From: p04896573 Date: Sat, 11 May 2024 17:55:31 +0800 Subject: [PATCH] Update README_DISTRIBUTED.md --- quick_start_clean/readmes/README_DISTRIBUTED.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quick_start_clean/readmes/README_DISTRIBUTED.md b/quick_start_clean/readmes/README_DISTRIBUTED.md index e5eef76..0361481 100644 --- a/quick_start_clean/readmes/README_DISTRIBUTED.md +++ b/quick_start_clean/readmes/README_DISTRIBUTED.md @@ -20,7 +20,7 @@ CMD="torchrun --nnodes=2 --nproc_per_node=8 --rdzv_id=1 --rdzv_backend=c10d --rd 接下来,在这两个机器中都执行bash sft_cpm9g_8b.sh,这样就完成一次最简单的多机训练 不过机器多了之后不推荐这种方式 -### slurm 集群多机任务提交 +## slurm 集群多机任务提交 算力平台使用Slurm调度,常用Slurm命令包括: ``` shell