diff --git a/examples/extras/badam/llama3_full_sft.yaml b/examples/extras/badam/llama3_full_sft.yaml
index 31d61c33..5b91fe7e 100644
--- a/examples/extras/badam/llama3_full_sft.yaml
+++ b/examples/extras/badam/llama3_full_sft.yaml
@@ -10,6 +10,7 @@ badam_mode: layer
 badam_switch_mode: ascending
 badam_switch_interval: 50
 badam_verbose: 2
+# deepspeed: examples/deepspeed/ds_z3_config.json
 
 ### dataset
 dataset: identity,alpaca_en_demo
@@ -29,7 +30,7 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
+learning_rate: 1.0e-5
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
diff --git a/examples/extras/badam/llama3_full_sft_ds3.yaml b/examples/extras/badam/llama3_full_sft_ds3.yaml
deleted file mode 100644
index f2d7309f..00000000
--- a/examples/extras/badam/llama3_full_sft_ds3.yaml
+++ /dev/null
@@ -1,42 +0,0 @@
-### model
-model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
-
-### method
-stage: sft
-do_train: true
-finetuning_type: full
-use_badam: true
-badam_mode: layer
-badam_switch_mode: ascending
-badam_switch_interval: 50
-badam_verbose: 2
-deepspeed: examples/deepspeed/ds_z3_config.json
-
-### dataset
-dataset: identity,alpaca_en_demo
-template: llama3
-cutoff_len: 1024
-max_samples: 1000
-overwrite_cache: true
-preprocessing_num_workers: 16
-
-### output
-output_dir: saves/llama3-8b/full/sft
-logging_steps: 10
-save_steps: 500
-plot_loss: true
-overwrite_output_dir: true
-
-### train
-per_device_train_batch_size: 1
-gradient_accumulation_steps: 8
-learning_rate: 1.0e-4
-num_train_epochs: 3.0
-lr_scheduler_type: cosine
-warmup_ratio: 0.1
-
-### eval
-val_size: 0.1
-per_device_eval_batch_size: 1
-eval_strategy: steps
-eval_steps: 500
diff --git a/examples/extras/galore/llama3_full_sft.yaml b/examples/extras/galore/llama3_full_sft.yaml
index 605545de..efd2c124 100644
--- a/examples/extras/galore/llama3_full_sft.yaml
+++ b/examples/extras/galore/llama3_full_sft.yaml
@@ -29,7 +29,7 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 1
-learning_rate: 1.0e-4
+learning_rate: 1.0e-5
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
diff --git a/examples/extras/mod/llama3_full_sft.yaml b/examples/extras/mod/llama3_full_sft.yaml
index 085febfc..f82bbd4c 100644
--- a/examples/extras/mod/llama3_full_sft.yaml
+++ b/examples/extras/mod/llama3_full_sft.yaml
@@ -26,7 +26,7 @@ overwrite_output_dir: true
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
 optim: paged_adamw_8bit
-learning_rate: 1.0e-4
+learning_rate: 1.0e-5
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
diff --git a/examples/train_full/llama3_full_sft_ds3.yaml b/examples/train_full/llama3_full_sft_ds3.yaml
index c983ad5c..c64596a1 100644
--- a/examples/train_full/llama3_full_sft_ds3.yaml
+++ b/examples/train_full/llama3_full_sft_ds3.yaml
@@ -25,7 +25,7 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 2
-learning_rate: 1.0e-4
+learning_rate: 1.0e-5
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1