From df9b4fb90a076c18f533da32beb7c42ae5b9ed22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E4=B8=80=E5=8D=9A?= Date: Wed, 20 Mar 2024 14:11:28 +0800 Subject: [PATCH] Updated README with new information --- .DS_Store | Bin 0 -> 8196 bytes README.md | 3 ++- README_zh.md | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..e130a32edfd31386156d7e87471065f412040b34 GIT binary patch literal 8196 zcmeHM%We}f6g^G@4G&dSRiti^<_DUJKbVFDsZ!Df6>K4y1R64FLNciuRL6K4VdlR{C=e6~3IqlI1qHZgOKR-c_wC(K1_gow|D^)*eu$XG#?I!} z>Z^l=9sy|UY}Uj&&jHGboQ<8$tyL<0+S7w^RO5~q#?pB|5;$$_Y;LWk!&o|u>)E&y ziqYQ5ixdtMTN}!tKu};?0Xe%rMjuQ5jY{WtSxV>3&5b4{*D)kIL%(p&7u# z{0mtL)owD2epQMUxlizxQJqt_9&MA1p&wms%2(qH8{KBZaaAzp*`T{xW((eJaeQme zSjIS}IKz8JA<9lgC0@5x<34NebY2(QnsTOAPUQmLb=osd_mM7zk8g|LJBa#xg>F3O5Akn`Nr;sd>1S1Q7d#2gnCIb;kYeB}tnT9dKf zFt(-6!X2`bqVjYU=U|`6gv>GL(ElFh){{B#o({Y2Rzx>t=6&z-ba?nZdS4*%QL=J8 z)FbvRns4=QxL!OYgWs8CtBsh&#?I!}N|UtfE=(Kte-R|4puhuCV9&;SBIp0k?(hE( zXhb2@pg>UIPZcor_Eoz@mhS2mY}w@_=8w#h1eaT@6c*kO0bc+9hau|`p~8us&8=k% QQ}_^|GK3;1@K+W11@U?mbpQYW literal 0 HcmV?d00001 diff --git a/README.md b/README.md index a0588a5a..1d3366b6 100644 --- a/README.md +++ b/README.md @@ -519,7 +519,8 @@ use_cpu: false ```bash deepspeed --num_gpus 8 src/train_bash.py \ - --deepspeed ds_config.json \ + --deepspeed ds_config.json \ + --ddp_timeout 180000000 \ # If the training data is too large, it is recommended to add the ddp_timeout command line option to prevent NCCL errors. ... # arguments (same as above) ``` diff --git a/README_zh.md b/README_zh.md index 24ba3e12..594dc651 100644 --- a/README_zh.md +++ b/README_zh.md @@ -519,7 +519,9 @@ use_cpu: false ```bash deepspeed --num_gpus 8 src/train_bash.py \ --deepspeed ds_config.json \ + --ddp_timeout 180000000 \ # 如训练数据过大,建议加上ddp_timeout命令行,防止nccl报错 ... # 参数同上 + ```
使用 DeepSpeed ZeRO-2 进行全参数训练的 ds_config.json 示例