diff --git a/examples/extras/badam/llama3_badam_sft.yaml b/examples/extras/badam/llama3_badam_sft.yaml
new file mode 100644
index 00000000..f5adb220
--- /dev/null
+++ b/examples/extras/badam/llama3_badam_sft.yaml
@@ -0,0 +1,40 @@
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+use_badam: true
+badam_switch_mode: ascending
+badam_switch_interval: 50
+badam_verbose: 2
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-6
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+
+### eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+eval_strategy: steps
+eval_steps: 500
diff --git a/examples/extras/badam/train_single_gpu.sh b/examples/extras/badam/train_single_gpu.sh
new file mode 100644
index 00000000..8af79007
--- /dev/null
+++ b/examples/extras/badam/train_single_gpu.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+export CUDA_VISIBLE_DEVICES=0
+
+cd ../../..
+
+llamafactory-cli train \
+    --stage sft \
+    --do_train True \
+    --model_name_or_path meta-llama/Llama-2-13b-hf \
+    --preprocessing_num_workers 16 \
+    --finetuning_type full \
+    --template default \
+    --flash_attn auto \
+    --dataset_dir data \
+    --dataset alpaca_en_demo \
+    --cutoff_len 1024 \
+    --learning_rate 1e-6 \
+    --num_train_epochs 3.0 \
+    --max_samples 100000 \
+    --per_device_train_batch_size 1 \
+    --gradient_accumulation_steps 8 \
+    --lr_scheduler_type cosine \
+    --max_grad_norm 1.0 \
+    --logging_steps 5 \
+    --save_steps 100 \
+    --warmup_steps 0 \
+    --optim adamw_torch \
+    --packing False \
+    --report_to none \
+    --use_badam True \
+    --output_dir saves/LLaMA2-13B/full/BAdam \
+    --plot_loss True \
+    --ddp_timeout 180000000 \
+    --include_num_input_tokens_seen True \
+    --badam_mode layer \
+    --badam_switch_mode ascending \
+    --badam_switch_interval 50
\ No newline at end of file
diff --git a/examples/extras/badam/train_zero3.sh b/examples/extras/badam/train_zero3.sh
new file mode 100644
index 00000000..3b182134
--- /dev/null
+++ b/examples/extras/badam/train_zero3.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+cd ../../..
+
+llamafactory-cli train \
+    --stage sft \
+    --do_train True \
+    --model_name_or_path meta-llama/Llama-2-13b-hf \
+    --preprocessing_num_workers 16 \
+    --finetuning_type full \
+    --template default \
+    --flash_attn auto \
+    --dataset_dir data \
+    --dataset alpaca_en_demo \
+    --cutoff_len 1024 \
+    --learning_rate 1e-6 \
+    --num_train_epochs 3.0 \
+    --max_samples 100000 \
+    --per_device_train_batch_size 8 \
+    --gradient_accumulation_steps 2 \
+    --lr_scheduler_type cosine \
+    --max_grad_norm 1.0 \
+    --logging_steps 5 \
+    --save_steps 100 \
+    --warmup_steps 0 \
+    --optim adamw_torch \
+    --packing False \
+    --report_to none \
+    --use_badam True \
+    --output_dir saves/LLaMA2-13B/full/BAdam \
+    --fp16 True \
+    --plot_loss True \
+    --ddp_timeout 180000000 \
+    --include_num_input_tokens_seen True \
+    --badam_mode layer \
+    --badam_switch_mode ascending \
+    --badam_switch_interval 50 \
+    --deepspeed cache/ds_z3_config.json
\ No newline at end of file
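The diff adds one YAML recipe and two launcher scripts. A usage sketch for the new entry points follows; it only assumes the standard `llamafactory-cli train <config.yaml>` invocation used by the repository's other example recipes, plus the paths added in this diff. Note that both scripts `cd ../../..` before calling the CLI, so they expect to be started from their own directory rather than from the repository root.

```bash
# Launch the BAdam full-parameter SFT recipe from the YAML config (run from the repo root).
llamafactory-cli train examples/extras/badam/llama3_badam_sft.yaml

# The shell scripts cd back to the repo root themselves, so start them from their own directory.
cd examples/extras/badam
bash train_single_gpu.sh   # single GPU, layer-wise BAdam
bash train_zero3.sh        # 4 GPUs with DeepSpeed ZeRO-3
```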
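`train_zero3.sh` points at `cache/ds_z3_config.json`, which is not created by this diff. Below is a minimal sketch of a DeepSpeed ZeRO-3 (stage 3) configuration that would satisfy that path; every field value here is an assumption for illustration, not the file the script actually expects.

```bash
# Hypothetical: write a minimal ZeRO-3 config at the path train_zero3.sh references.
# All values are assumptions; adjust them to your setup before training.
mkdir -p cache
cat > cache/ds_z3_config.json << 'EOF'
{
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "zero_allow_untested_optimizer": true,
  "fp16": {
    "enabled": "auto"
  },
  "zero_optimization": {
    "stage": 3,
    "overlap_comm": true,
    "contiguous_gradients": true,
    "stage3_gather_16bit_weights_on_model_save": true
  }
}
EOF
```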