parent 1539c72b94
commit eed33862bc

@@ -107,13 +107,13 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_l
 
 ### LoRA Fine-Tuning on Multiple GPUs
 
-#### Supervised Fine-Tuning with Accelerate on Single Node
+#### Supervised Fine-Tuning on Single Node
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
 ```
 
-#### Supervised Fine-Tuning with Accelerate on Multiple Nodes
+#### Supervised Fine-Tuning on Multiple Nodes
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml

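This hunk and the full-parameter hunk below show the multi-node command for the first node only. As a hedged sketch (assuming both nodes have four GPUs and can reach the master address), the second node runs the same command with only RANK changed, since RANK indexes the node:

```bash
# Hypothetical second-node invocation: identical to the RANK=0 command above,
# except that RANK=1 marks this machine as the second of NNODES=2 nodes.
CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
```
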
@@ -136,13 +136,13 @@ ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu
 
 ### Full-Parameter Fine-Tuning on Multiple GPUs
 
-#### Supervised Fine-Tuning with Accelerate on Single Node
+#### Supervised Fine-Tuning on Single Node
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
 ```
 
-#### Supervised Fine-Tuning with Accelerate on Multiple Nodes
+#### Supervised Fine-Tuning on Multiple Nodes
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml

@@ -107,13 +107,13 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_l
 
 ### LoRA Fine-Tuning on Multiple GPUs
 
-#### Single-Node Training with Accelerate
+#### Supervised Fine-Tuning on a Single Node
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
 ```
 
-#### Multi-Node Training with Accelerate
+#### Supervised Fine-Tuning on Multiple Nodes
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml

@@ -128,7 +128,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llam
 
 ### LoRA Fine-Tuning on Multiple NPUs
 
-#### Training with DeepSpeed ZeRO-0
+#### Supervised Fine-Tuning with DeepSpeed ZeRO-0
 
 ```bash
 ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu/llama3_lora_sft_ds.yaml

@@ -136,13 +136,13 @@ ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu
 
 ### Full-Parameter Fine-Tuning on Multiple GPUs
 
-#### Single-Node Training with DeepSpeed
+#### Supervised Fine-Tuning on a Single Node
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
 ```
 
-#### Multi-Node Training with DeepSpeed
+#### Supervised Fine-Tuning on Multiple Nodes
 
 ```bash
 CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml

@@ -28,10 +28,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 pure_bf16: true
 
 ### eval

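The same two fixes repeat across all of the YAML hunks that follow: learning rates are rewritten in scientific notation, which leaves the parsed value unchanged, and `warmup_steps` is renamed to `warmup_ratio`, since 0.1 denotes a fraction of total training steps while `warmup_steps` expects an integer step count. A minimal check of the notation point, assuming a Python interpreter with PyYAML installed:

```bash
# 0.0001 and 1.0e-4 parse to the same float; only the spelling differs.
python -c "import yaml; assert yaml.safe_load('lr: 1.0e-4') == {'lr': 0.0001}"
```
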
@@ -29,10 +29,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval

@@ -29,10 +29,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 1
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 pure_bf16: true
 
 ### eval

@@ -27,10 +27,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval

@@ -26,10 +26,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval

@@ -26,10 +26,10 @@ overwrite_output_dir: true
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 optim: paged_adamw_8bit
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 pure_bf16: true
 
 ### eval

@@ -28,10 +28,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval

@@ -28,10 +28,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval

@@ -29,10 +29,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval

@@ -29,10 +29,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval

@@ -27,10 +27,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.000005
+learning_rate: 5.0e-6
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval

@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.000005
+learning_rate: 5.0e-6
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval

@@ -26,10 +26,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.00001
+learning_rate: 1.0e-5
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### generate

@@ -24,10 +24,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval

@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.00001
+learning_rate: 1.0e-5
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval

@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval

@@ -26,10 +26,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval

@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval

@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval

@@ -26,10 +26,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval

@@ -25,10 +25,10 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 0.0001
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
-warmup_steps: 0.1
+warmup_ratio: 0.1
 fp16: true
 
 ### eval

@@ -107,7 +107,7 @@ class ModelArguments:
     )
     vllm_maxlen: int = field(
         default=2048,
-        metadata={"help": "Maximum sequence length of the vLLM engine (including prompt and output)."},
+        metadata={"help": "Maximum sequence (prompt + response) length of the vLLM engine."},
     )
     vllm_gpu_util: float = field(
         default=0.9,

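The last hunk only rewords the help text for `vllm_maxlen`: the 2048-token default bounds the prompt and the generated response together. A hedged sketch of overriding it, assuming the CLI forwards ModelArguments fields as command-line flags (the model path is illustrative):

```bash
# Assumption: llamafactory-cli exposes dataclass fields as --flags;
# vllm_maxlen caps prompt + response tokens for the vLLM backend.
llamafactory-cli chat --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
  --template llama3 --infer_backend vllm --vllm_maxlen 4096
```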