tiny fix about badam

hiyouga 2024-06-25 01:54:53 +08:00
parent efb81b25ec
commit 095fab58d3
17 changed files with 31 additions and 102 deletions


@@ -34,7 +34,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          python -m pip install .[torch,dev]
+          python -m pip install ".[torch,dev]"

       - name: Check quality
         run: |
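
Quoting the extras specifier is not just cosmetic: in shells such as zsh, unquoted square brackets are treated as a glob pattern, so the install can fail before pip even runs. A minimal sketch of the difference:

# zsh may abort with "no matches found: .[torch,dev]" because [] is expanded as a glob
python -m pip install .[torch,dev]
# quoting passes the extras specifier to pip verbatim in any shell
python -m pip install ".[torch,dev]"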

.gitignore

@@ -160,8 +160,8 @@ cython_debug/
 .idea/

 # custom .gitignore
-user.config
-saves/
-cache/
-wandb
-ds_badam_exp
+cache/
+config/
+saves/
+output/
+wandb/

Binary file not shown (image; 198 KiB before, 141 KiB after).

Binary file not shown (image; 194 KiB before, 149 KiB after).


@@ -6,6 +6,7 @@ stage: sft
 do_train: true
 finetuning_type: full
 use_badam: true
+badam_mode: layer
 badam_switch_mode: ascending
 badam_switch_interval: 50
 badam_verbose: 2
@@ -32,7 +33,6 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-pure_bf16: true

 ### eval
 val_size: 0.1
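
With badam_mode: layer now explicit in the config, a run can be launched by handing the YAML file to llamafactory-cli train; the path below is an assumption about where this example lives and is not shown in the diff:

# single-GPU layer-wise BAdam run (config path assumed)
CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/extras/badam/llama3_full_sft.yaml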


@@ -6,9 +6,11 @@ stage: sft
 do_train: true
 finetuning_type: full
 use_badam: true
+badam_mode: layer
 badam_switch_mode: ascending
 badam_switch_interval: 50
 badam_verbose: 2
+deepspeed: examples/deepspeed/ds_z3_config.json

 ### dataset
 dataset: identity,alpaca_en_demo
@@ -28,7 +30,7 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 1.0e-6
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
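
Because this config now carries deepspeed: examples/deepspeed/ds_z3_config.json, the multi-GPU ZeRO-3 setup that the deleted shell scripts below handled via CLI flags can be driven from the YAML alone; a sketch of the launch, with the config path assumed:

# multi-GPU layer-wise BAdam under DeepSpeed ZeRO-3 (config path assumed)
CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/extras/badam/llama3_full_sft_ds3.yaml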


@@ -1,37 +0,0 @@
#!/bin/bash
export CUDA_VISIBLE_DEVICES=0
cd ../../..
llamafactory-cli train \
--stage sft \
--do_train True \
--model_name_or_path meta-llama/Llama-2-13b-hf \
--preprocessing_num_workers 16 \
--finetuning_type full \
--template default \
--flash_attn auto \
--dataset_dir data \
--dataset alpaca_en_demo \
--cutoff_len 1024 \
--learning_rate 1e-6 \
--num_train_epochs 3.0 \
--max_samples 100000 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 8 \
--lr_scheduler_type cosine \
--max_grad_norm 1.0 \
--logging_steps 5 \
--save_steps 100 \
--warmup_steps 0 \
--optim adamw_torch \
--packing False \
--report_to none \
--use_badam True \
--output_dir saves/LLaMA2-13B/full/BAdam \
--plot_loss True \
--ddp_timeout 180000000 \
--include_num_input_tokens_seen True \
--badam_mode layer \
--badam_switch_mode ascending \
--badam_switch_interval 50


@@ -1,39 +0,0 @@
#!/bin/bash
export CUDA_VISIBLE_DEVICES=0,1,2,3
cd ../../..
llamafactory-cli train \
--stage sft \
--do_train True \
--model_name_or_path meta-llama/Llama-2-13b-hf \
--preprocessing_num_workers 16 \
--finetuning_type full \
--template default \
--flash_attn auto \
--dataset_dir data \
--dataset alpaca_en_demo \
--cutoff_len 1024 \
--learning_rate 1e-6 \
--num_train_epochs 3.0 \
--max_samples 100000 \
--per_device_train_batch_size 8 \
--gradient_accumulation_steps 2 \
--lr_scheduler_type cosine \
--max_grad_norm 1.0 \
--logging_steps 5 \
--save_steps 100 \
--warmup_steps 0 \
--optim adamw_torch \
--packing False \
--report_to none \
--use_badam True \
--output_dir saves/LLaMA2-13B/full/BAdam \
--fp16 True \
--plot_loss True \
--ddp_timeout 180000000 \
--include_num_input_tokens_seen True \
--badam_mode layer \
--badam_switch_mode ascending \
--badam_switch_interval 50 \
--deepspeed cache/ds_z3_config.json


@@ -41,7 +41,7 @@ extra_require = {
     "bitsandbytes": ["bitsandbytes>=0.39.0"],
     "vllm": ["vllm>=0.4.3"],
     "galore": ["galore-torch"],
-    "badam": ["badam"],
+    "badam": ["badam>=1.2.1"],
     "gptq": ["optimum>=1.16.0", "auto-gptq>=0.5.0"],
     "awq": ["autoawq"],
     "aqlm": ["aqlm[gpu]>=1.1.0"],


@@ -121,7 +121,7 @@ def _check_extra_dependencies(
         require_version("galore_torch", "To fix: pip install galore_torch")

     if finetuning_args.use_badam:
-        require_version("badam", "To fix: pip install badam")
+        require_version("badam>=1.2.1", "To fix: pip install badam>=1.2.1")

     if finetuning_args.plot_loss:
         require_version("matplotlib", "To fix: pip install matplotlib")
@@ -214,15 +214,15 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
     if (
         finetuning_args.use_badam
-        and training_args.parallel_mode.value == "distributed"
+        and training_args.parallel_mode == ParallelMode.DISTRIBUTED
     ):
         if finetuning_args.badam_mode == "ratio":
-            raise ValueError("Ratio-wise BAdam does not yet support distributed training, use layer-wise BAdam: --badam_mode layer")
-        if finetuning_args.badam_mode == "layer" and (not is_deepspeed_zero3_enabled()):
-            raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO 3 stage.")
+            raise ValueError("Ratio-based BAdam does not yet support distributed training, use layer-wise BAdam.")
+        elif not is_deepspeed_zero3_enabled():
+            raise ValueError("Layer-wise BAdam only supports DeepSpeed ZeRO-3 training.")

-    if (finetuning_args.use_galore) and training_args.deepspeed is not None:
-        raise ValueError("GaLore are incompatible with DeepSpeed yet.")
+    if finetuning_args.use_galore and training_args.deepspeed is not None:
+        raise ValueError("GaLore is incompatible with DeepSpeed yet.")

     if model_args.infer_backend == "vllm":
         raise ValueError("vLLM backend is only available for API, CLI and Web.")


@@ -96,7 +96,8 @@ class CustomDPOTrainer(DPOTrainer):
             self.save_model(os.path.join(self.args.output_dir, "pissa_init"))

         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)


@@ -91,7 +91,8 @@ class CustomKTOTrainer(KTOTrainer):
                self.ref_model.eval()

        if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
            self.callback_handler.add_callback(BAdamCallback)


@@ -166,7 +166,8 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
                self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True)

        if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
            self.callback_handler.add_callback(BAdamCallback)


@@ -48,7 +48,8 @@ class CustomTrainer(Trainer):
             self.save_model(os.path.join(self.args.output_dir, "pissa_init"))

         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)


@@ -72,7 +72,8 @@ class PairwiseTrainer(Trainer):
         self.processor = processor
         self.can_return_loss = True  # override property to return eval_loss
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)


@@ -56,7 +56,8 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
             self.save_model(os.path.join(self.args.output_dir, "pissa_init"))

         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)


@@ -23,6 +23,7 @@ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union
 import torch
 from peft import PeftModel
 from transformers import Trainer
+from transformers.integrations import is_deepspeed_zero3_enabled
 from transformers.optimization import get_scheduler
 from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
 from transformers.trainer_pt_utils import get_parameter_names
@@ -372,9 +373,6 @@ def _create_badam_optimizer(
         dict(params=decay_params, weight_decay=training_args.weight_decay),
     ]

-    from transformers.integrations import is_deepspeed_zero3_enabled
-    ds_zero3_enabled = is_deepspeed_zero3_enabled()
-
     if finetuning_args.badam_mode == "layer":
         from badam import BlockOptimizer
@@ -387,7 +385,7 @@ def _create_badam_optimizer(
             start_block=finetuning_args.badam_start_block,
             switch_mode=finetuning_args.badam_switch_mode,
             verbose=finetuning_args.badam_verbose,
-            ds_zero3_enabled=ds_zero3_enabled
+            ds_zero3_enabled=is_deepspeed_zero3_enabled(),
         )
         logger.info(
             f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, "
@@ -398,7 +396,6 @@
     elif finetuning_args.badam_mode == "ratio":
         from badam import BlockOptimizerRatio

-        assert not ds_zero3_enabled, "BAdam with ratio-based update does not support Deepspeed ZeRO-3 yet, use layer-wise update instead: --badam_mode layer."
         assert finetuning_args.badam_update_ratio > 1e-6
         optimizer = BlockOptimizerRatio(
             param_groups=param_groups,