This commit is contained in:
hiyouga 2024-06-03 19:12:29 +08:00
parent 1539c72b94
commit eed33862bc
24 changed files with 52 additions and 52 deletions

View File

@ -107,13 +107,13 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_l
### LoRA Fine-Tuning on Multiple GPUs
#### Supervised Fine-Tuning with Accelerate on Single Node
#### Supervised Fine-Tuning on Single Node
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
```
#### Supervised Fine-Tuning with Accelerate on Multiple Nodes
#### Supervised Fine-Tuning on Multiple Nodes
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
@ -136,13 +136,13 @@ ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu
### Full-Parameter Fine-Tuning on Multiple GPUs
#### Supervised Fine-Tuning with Accelerate on Single Node
#### Supervised Fine-Tuning on Single Node
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
```
#### Supervised Fine-Tuning with Accelerate on Multiple Nodes
#### Supervised Fine-Tuning on Multiple Nodes
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml

View File

@ -107,13 +107,13 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_l
### 多 GPU LoRA 微调
#### 使用 Accelerate 进行单节点训练
#### 在单机上进行指令监督微调
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
```
#### 使用 Accelerate 进行多节点训练
#### 在多机上进行指令监督微调
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
@ -128,7 +128,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llam
### 多 NPU LoRA 微调
#### 使用 DeepSpeed ZeRO-0 训练
#### 使用 DeepSpeed ZeRO-0 进行指令监督微调
```bash
ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu/llama3_lora_sft_ds.yaml
@ -136,13 +136,13 @@ ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu
### 多 GPU 全参数微调
#### 使用 DeepSpeed 进行单节点训练
#### 在单机上进行指令监督微调
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
```
#### 使用 DeepSpeed 进行多节点训练
#### 在多机上进行指令监督微调
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml

View File

@ -28,10 +28,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
pure_bf16: true
### eval

View File

@ -29,10 +29,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### eval

View File

@ -29,10 +29,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 1
learning_rate: 0.0001
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
pure_bf16: true
### eval

View File

@ -27,10 +27,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### eval

View File

@ -26,10 +26,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### eval

View File

@ -26,10 +26,10 @@ overwrite_output_dir: true
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
optim: paged_adamw_8bit
learning_rate: 0.0001
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
pure_bf16: true
### eval

View File

@ -28,10 +28,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
learning_rate: 0.0001
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### eval

View File

@ -28,10 +28,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
learning_rate: 0.0001
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### eval

View File

@ -29,10 +29,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
learning_rate: 0.0001
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### eval

View File

@ -29,10 +29,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
learning_rate: 0.0001
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### eval

View File

@ -27,10 +27,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.000005
learning_rate: 5.0e-6
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### eval

View File

@ -25,10 +25,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.000005
learning_rate: 5.0e-6
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### eval

View File

@ -26,10 +26,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.00001
learning_rate: 1.0e-5
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### generate

View File

@ -24,10 +24,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### eval

View File

@ -25,10 +25,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.00001
learning_rate: 1.0e-5
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### eval

View File

@ -25,10 +25,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### eval

View File

@ -26,10 +26,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### eval

View File

@ -25,10 +25,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### eval

View File

@ -25,10 +25,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### eval

View File

@ -26,10 +26,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### eval

View File

@ -25,10 +25,10 @@ overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 0.0001
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0.1
warmup_ratio: 0.1
fp16: true
### eval

View File

@ -107,7 +107,7 @@ class ModelArguments:
)
vllm_maxlen: int = field(
default=2048,
metadata={"help": "Maximum sequence length of the vLLM engine (including prompt and output)."},
metadata={"help": "Maximum sequence (prompt + response) length of the vLLM engine."},
)
vllm_gpu_util: float = field(
default=0.9,