Merge branch 'hiyouga:main' into main

2024-04-23 18:46:12 +08:00 · 2024-04-23 18:46:12 +08:00 · cde4dfe569
parent 4dcb11eab7 2efd9b6ba0
commit cde4dfe569
7 changed files with 11 additions and 8 deletions
--- a/examples/README.md
+++ b/examples/README.md
@@ -19,11 +19,11 @@ examples/
 ├── lora_multi_gpu/
 │   ├── single_node.sh: Fine-tune model with Accelerate on single node using LoRA
 │   ├── multi_node.sh: Fine-tune model with Accelerate on multiple nodes using LoRA
-│   └── ds_zero3.sh: Fine-tune model with DeepSpeed ZeRO-3 using LoRA
+│   └── ds_zero3.sh: Fine-tune model with DeepSpeed ZeRO-3 using LoRA (weight sharding)
 ├── full_multi_gpu/
 │   ├── single_node.sh: Full fine-tune model with DeepSpeed on single node
 │   ├── multi_node.sh: Full fine-tune model with DeepSpeed on multiple nodes
-│   └── predict.sh: Do batch predict and compute BLEU and ROUGE scores after full tuning
+│   └── predict.sh: Run parallel (multi-GPU) batch prediction and compute BLEU and ROUGE scores after full tuning
 ├── merge_lora/
 │   ├── merge.sh: Merge LoRA weights into the pre-trained models
 │   └── quantize.sh: Quantize the fine-tuned model with AutoGPTQ
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -19,11 +19,11 @@ examples/
 ├── lora_multi_gpu/
 │   ├── single_node.sh: 使用 Accelerate 进行单节点 LoRA 训练
 │   ├── multi_node.sh: 使用 Accelerate 进行多节点 LoRA 训练
-│   └── ds_zero3.sh: 使用 DeepSpeed ZeRO-3 进行 LoRA 训练
+│   └── ds_zero3.sh: 使用 DeepSpeed ZeRO-3 进行 LoRA 训练（拆分权重）
 ├── full_multi_gpu/
 │   ├── single_node.sh: 使用 DeepSpeed 进行单节点全量训练
 │   ├── multi_node.sh: 使用 DeepSpeed 进行多节点全量训练
-│   └── predict.sh: 基于全量训练进行批量预测并计算 BLEU 和 ROUGE 分数
+│   └── predict.sh: 基于全量训练进行多卡批量预测并计算 BLEU 和 ROUGE 分数
 ├── merge_lora/
 │   ├── merge.sh: 将 LoRA 权重合并到预训练模型中
 │   └── quantize.sh: 使用 AutoGPTQ 量化微调后的模型
--- a/examples/accelerate/master_config.yaml
+++ b/examples/accelerate/master_config.yaml
@@ -9,7 +9,7 @@ main_process_port: 29555
 main_training_function: main
 mixed_precision: fp16
 num_machines: 2 # the number of nodes
-num_processes: 16 # the number of GPUs in all nodes
+num_processes: 8 # the total number of GPUs across all nodes
 rdzv_backend: static
 same_network: true
 tpu_env: []
--- a/examples/accelerate/slave_config.yaml
+++ b/examples/accelerate/slave_config.yaml
@@ -9,7 +9,7 @@ main_process_port: 29555
 main_training_function: main
 mixed_precision: fp16
 num_machines: 2 # the number of nodes
-num_processes: 16 # the number of GPUs in all nodes
+num_processes: 8 # the total number of GPUs across all nodes
 rdzv_backend: static
 same_network: true
 tpu_env: []
--- a/examples/full_multi_gpu/predict.sh
+++ b/examples/full_multi_gpu/predict.sh
@@ -1,6 +1,8 @@
 #!/bin/bash

-CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
+    --config_file ../accelerate/single_config.yaml \
+    ../../src/train_bash.py \
    --stage sft \
    --do_predict \
    --model_name_or_path ../../saves/LLaMA2-7B/full/sft \
--- a/examples/lora_multi_gpu/multi_node.sh
+++ b/examples/lora_multi_gpu/multi_node.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# Also launch this script on the slave machine, using slave_config.yaml instead

 CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
    --config_file ../accelerate/master_config.yaml \
--- a/examples/lora_multi_gpu/single_node.sh
+++ b/examples/lora_multi_gpu/single_node.sh
@@ -1,6 +1,6 @@
 #!/bin/bash

-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch \
+CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
    --config_file ../accelerate/single_config.yaml \
    ../../src/train_bash.py \
    --stage sft \