fix #4944
commit 1bbd49faae
parent 1550fe7331
@@ -10,6 +10,7 @@ badam_mode: layer
 badam_switch_mode: ascending
 badam_switch_interval: 50
 badam_verbose: 2
+# deepspeed: examples/deepspeed/ds_z3_config.json

 ### dataset
 dataset: identity,alpaca_en_demo
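Net effect of this hunk: the BAdam layer-mode example no longer enables DeepSpeed ZeRO-3 by default; the config path is kept only as a commented-out opt-in. For reference, a minimal sketch of the resulting method block, assembled from the keys visible in this hunk and in the deleted file further below (not a verbatim copy of the updated file):

### method
stage: sft
do_train: true
finetuning_type: full
use_badam: true
badam_mode: layer
badam_switch_mode: ascending
badam_switch_interval: 50
badam_verbose: 2
# deepspeed: examples/deepspeed/ds_z3_config.json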
@@ -29,7 +30,7 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
+learning_rate: 1.0e-5
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
@@ -1,42 +0,0 @@
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-
-### method
-stage: sft
-do_train: true
-finetuning_type: full
-use_badam: true
-badam_mode: layer
-badam_switch_mode: ascending
-badam_switch_interval: 50
-badam_verbose: 2
-deepspeed: examples/deepspeed/ds_z3_config.json
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/full/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
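Rather than receiving the same learning-rate fix, the standalone ZeRO-3 variant of the BAdam example is deleted outright. Anyone who still wants to pair BAdam with ZeRO-3 would uncomment the line added in the first hunk; since disabling that pairing appears to be the point of fix #4944, treat this as an unsupported sketch:

use_badam: true
badam_mode: layer
# uncomment at your own risk; this commit suggests the combination was problematic
deepspeed: examples/deepspeed/ds_z3_config.json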
@@ -29,7 +29,7 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 1
-learning_rate: 1.0e-4
+learning_rate: 1.0e-5
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
@@ -26,7 +26,7 @@ overwrite_output_dir: true
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 optim: paged_adamw_8bit
-learning_rate: 1.0e-4
+learning_rate: 1.0e-5
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
@@ -25,7 +25,7 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
-learning_rate: 1.0e-4
+learning_rate: 1.0e-5
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
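The last three hunks, like the second one, touch only the learning rate: every full-parameter SFT example drops from 1.0e-4 to 1.0e-5, a 10x reduction. Read off the new sides of the hunks, the shared train block after this commit is the following; gradient_accumulation_steps is the one key that varies across the examples, and one of them additionally sets optim: paged_adamw_8bit.

### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8  # 1, 2, or 8 depending on the example
learning_rate: 1.0e-5  # lowered from 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1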