diff --git a/examples/README.md b/examples/README.md
index 871bf0de..111f50bd 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -19,11 +19,11 @@ examples/
 ├── lora_multi_gpu/
 │   ├── single_node.sh: Fine-tune model with Accelerate on single node using LoRA
 │   ├── multi_node.sh: Fine-tune model with Accelerate on multiple nodes using LoRA
-│   └── ds_zero3.sh: Fine-tune model with DeepSpeed ZeRO-3 using LoRA
+│   └── ds_zero3.sh: Fine-tune model with DeepSpeed ZeRO-3 using LoRA (weight sharding)
 ├── full_multi_gpu/
 │   ├── single_node.sh: Full fine-tune model with DeepSpeed on single node
 │   ├── multi_node.sh: Full fine-tune model with DeepSpeed on multiple nodes
-│   └── predict.sh: Do batch predict and compute BLEU and ROUGE scores after full tuning
+│   └── predict.sh: Do parallel batch predict and compute BLEU and ROUGE scores after full tuning
 ├── merge_lora/
 │   ├── merge.sh: Merge LoRA weights into the pre-trained models
 │   └── quantize.sh: Quantize the fine-tuned model with AutoGPTQ
diff --git a/examples/README_zh.md b/examples/README_zh.md
index c4f2062e..fecbdb2f 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -19,11 +19,11 @@ examples/
 ├── lora_multi_gpu/
 │   ├── single_node.sh: 使用 Accelerate 进行单节点 LoRA 训练
 │   ├── multi_node.sh: 使用 Accelerate 进行多节点 LoRA 训练
-│   └── ds_zero3.sh: 使用 DeepSpeed ZeRO-3 进行 LoRA 训练
+│   └── ds_zero3.sh: 使用 DeepSpeed ZeRO-3 进行 LoRA 训练(拆分权重)
 ├── full_multi_gpu/
 │   ├── single_node.sh: 使用 DeepSpeed 进行单节点全量训练
 │   ├── multi_node.sh: 使用 DeepSpeed 进行多节点全量训练
-│   └── predict.sh: 基于全量训练进行批量预测并计算 BLEU 和 ROUGE 分数
+│   └── predict.sh: 基于全量训练进行多卡批量预测并计算 BLEU 和 ROUGE 分数
 ├── merge_lora/
 │   ├── merge.sh: 将 LoRA 权重合并到预训练模型中
 │   └── quantize.sh: 使用 AutoGPTQ 量化微调后的模型
diff --git a/examples/accelerate/master_config.yaml b/examples/accelerate/master_config.yaml
index 9c8fc275..a1018313 100644
--- a/examples/accelerate/master_config.yaml
+++ b/examples/accelerate/master_config.yaml
@@ -9,7 +9,7 @@ main_process_port: 29555
 main_training_function: main
 mixed_precision: fp16
 num_machines: 2 # the number of nodes
-num_processes: 16 # the number of GPUs in all nodes
+num_processes: 8 # the number of GPUs in all nodes
 rdzv_backend: static
 same_network: true
 tpu_env: []
diff --git a/examples/accelerate/slave_config.yaml b/examples/accelerate/slave_config.yaml
index e4a63e82..e610fd0e 100644
--- a/examples/accelerate/slave_config.yaml
+++ b/examples/accelerate/slave_config.yaml
@@ -9,7 +9,7 @@ main_process_port: 29555
 main_training_function: main
 mixed_precision: fp16
 num_machines: 2 # the number of nodes
-num_processes: 16 # the number of GPUs in all nodes
+num_processes: 8 # the number of GPUs in all nodes
 rdzv_backend: static
 same_network: true
 tpu_env: []
diff --git a/examples/full_multi_gpu/predict.sh b/examples/full_multi_gpu/predict.sh
index 52fdc7a0..801df85a 100644
--- a/examples/full_multi_gpu/predict.sh
+++ b/examples/full_multi_gpu/predict.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
+CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
+    --config_file ../accelerate/single_config.yaml \
+    ../../src/train_bash.py \
     --stage sft \
     --do_predict \
     --model_name_or_path ../../saves/LLaMA2-7B/full/sft \
diff --git a/examples/lora_multi_gpu/multi_node.sh b/examples/lora_multi_gpu/multi_node.sh
index 5172b9a6..85a3e026 100644
--- a/examples/lora_multi_gpu/multi_node.sh
+++ b/examples/lora_multi_gpu/multi_node.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+# also launch it on slave machine using slave_config.yaml
 
 CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
     --config_file ../accelerate/master_config.yaml \
diff --git a/examples/lora_multi_gpu/single_node.sh b/examples/lora_multi_gpu/single_node.sh
index 269d76d7..04529cf0 100644
--- a/examples/lora_multi_gpu/single_node.sh
+++ b/examples/lora_multi_gpu/single_node.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch \
+CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
    --config_file ../accelerate/single_config.yaml \
    ../../src/train_bash.py \
    --stage sft \
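Note (not part of the patch): the comment added to multi_node.sh says the same command must also be launched on the slave machine, pointing at slave_config.yaml instead of master_config.yaml. A minimal sketch of that slave-side launch, assuming the training flags mirror the master script; the flag list is illustrative and truncated, like the scripts above:

#!/bin/bash
# Hypothetical slave-side counterpart to multi_node.sh (sketch only);
# machine_rank and main_process_ip come from slave_config.yaml, not flags.
CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
    --config_file ../accelerate/slave_config.yaml \
    ../../src/train_bash.py \
    --stage sft  # plus the same remaining training flags as the master launch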