From 06c8908d3fe48907ddb585c5fa15677fc5416f94 Mon Sep 17 00:00:00 2001 From: Jonery Date: Mon, 15 Apr 2024 23:15:27 +0800 Subject: [PATCH 01/14] Feature BAdam --- examples/extras/badam/sft.sh | 36 ++++++++++++++++ requirements.txt | 1 + src/llmtuner/hparams/finetuning_args.py | 43 ++++++++++++++++++- src/llmtuner/hparams/parser.py | 6 +++ src/llmtuner/model/adapter.py | 6 +-- src/llmtuner/model/patcher.py | 5 ++- src/llmtuner/model/utils.py | 42 ++++++++++++++++++ src/llmtuner/train/sft/trainer.py | 6 ++- src/llmtuner/train/utils.py | 57 +++++++++++++++++++++++++ 9 files changed, 195 insertions(+), 7 deletions(-) create mode 100644 examples/extras/badam/sft.sh diff --git a/examples/extras/badam/sft.sh b/examples/extras/badam/sft.sh new file mode 100644 index 00000000..daa63913 --- /dev/null +++ b/examples/extras/badam/sft.sh @@ -0,0 +1,36 @@ +# BAdam layer-wise +export CUDA_VISIBLE_DEVICES=0 +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True +python ../../../src/train_bash.py \ +--stage sft \ +--do_train \ +--model_name_or_path meta-llama/Llama-2-7b-hf \ +--dataset alpaca_gpt4_en,glaive_toolcall \ +--dataset_dir ../../../data \ +--template default \ +--finetuning_type full \ +--output_dir ../../../saves/LLaMA2-7B/badam \ +--overwrite_cache \ +--overwrite_output_dir \ +--cutoff_len 1024 \ +--preprocessing_num_workers 32 \ +--per_device_train_batch_size 8 \ +--per_device_eval_batch_size 5 \ +--gradient_accumulation_steps 2 \ +--lr_scheduler_type cosine \ +--logging_steps 10 \ +--warmup_steps 20 \ +--save_steps 100 \ +--eval_steps 100 \ +--evaluation_strategy steps \ +--load_best_model_at_end \ +--learning_rate 5e-5 \ +--num_train_epochs 3.0 \ +--val_size 0.1 \ +--plot_loss \ +--use_badam \ +--switch_mode descending \ +--badam_verbose 2 \ +--switch_block_every 50 \ +--pure_bf16 \ + diff --git a/requirements.txt b/requirements.txt index 1fa5a142..9d58d75a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,4 @@ fastapi sse-starlette matplotlib fire +badam \ No newline at end of file diff --git a/src/llmtuner/hparams/finetuning_args.py b/src/llmtuner/hparams/finetuning_args.py index 177a9f8a..d64f1583 100644 --- a/src/llmtuner/hparams/finetuning_args.py +++ b/src/llmtuner/hparams/finetuning_args.py @@ -163,6 +163,47 @@ class RLHFArguments: metadata={"help": "The type of the reward model in PPO training. Lora model only supports lora training."}, ) +@dataclass +class BAdamArgument: + r""" + Arguments for BAdam optimizer. + """ + use_badam: bool = field( + default=False, + metadata={"help": "Whether or not to use BAdam optimizer."}, + ) + badam_mode: Literal["layer", "ratio"] = field( + default="layer", + metadata={"help": "The mode of BAdam optimizer. 'layer' for layer-wise, 'ratio' for ratio-wise."}, + ) + + # ======== Arguments for layer-wise update ======== + start_block: Optional[int] = field( + default=None, + metadata={"help": "The starting block index for block-wise fine-tuning."} + ) + switch_block_every: Optional[int] = field( + default=50, + metadata={"help": "how often to switch model's block update. 
Set to -1 to disable the block update."} + ) + switch_mode: Optional[Literal["ascending", "descending", "random", "fixed"]] = field( + default="ascending", + metadata={"help": "the strategy of picking block to update."} + ) + + # ======== Arguments for ratio-wise update ======== + badam_update_ratio: float = field( + default=0., + metadata={"help": "The ratio of the update for the BAdam optimizer."} + ) + badam_mask_mode: Literal["adjacent", "scatter"] = field( + default="adjacent", + metadata={"help": "The mode of the mask for BAdam optimizer. `adjacent` means that the trainable parameters are adjacent to each other; `scatter` means that trainable parameters are randomly choosed from the weight."} + ) + badam_verbose: int = field( + default=0, + metadata={"help": "The verbosity level of BAdam optimizer. 0 for no print, 1 for print the block prefix, 2 for print trainable parameters"} + ) @dataclass class GaloreArguments: @@ -204,7 +245,7 @@ class GaloreArguments: @dataclass -class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreArguments): +class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreArguments, BAdamArgument): r""" Arguments pertaining to which techniques we are going to fine-tuning with. """ diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py index 8f3bd18a..032a1a4b 100644 --- a/src/llmtuner/hparams/parser.py +++ b/src/llmtuner/hparams/parser.py @@ -171,6 +171,12 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if finetuning_args.use_galore and training_args.deepspeed is not None: raise ValueError("GaLore is incompatible with DeepSpeed.") + if (finetuning_args.use_badam + and finetuning_args.badam_mode == "layer" + and training_args.parallel_mode.value == "distributed" + ): + raise ValueError("BAdam with layer-wise mode is not supported in distributed training by now, use ratio mode instead.") + if model_args.infer_backend == "vllm": raise ValueError("vLLM backend is only available for API, CLI and Web.") diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py index 4bb4057d..71c9c2f4 100644 --- a/src/llmtuner/model/adapter.py +++ b/src/llmtuner/model/adapter.py @@ -37,7 +37,7 @@ def init_adapter( if finetuning_args.finetuning_type == "full" and is_trainable: logger.info("Fine-tuning method: Full") - if not finetuning_args.pure_bf16: + if (not finetuning_args.pure_bf16) and (not finetuning_args.use_badam): model = model.float() if finetuning_args.finetuning_type == "freeze" and is_trainable: @@ -82,7 +82,7 @@ def init_adapter( for name, param in model.named_parameters(): if any(trainable_layer in name for trainable_layer in trainable_layers): - if not finetuning_args.pure_bf16: + if (not finetuning_args.pure_bf16) and (not finetuning_args.use_badam): param.data = param.data.to(torch.float32) else: param.requires_grad_(False) @@ -162,7 +162,7 @@ def init_adapter( ) model = get_peft_model(model, lora_config) - if not finetuning_args.pure_bf16: + if (not finetuning_args.pure_bf16) and (not finetuning_args.use_badam): for param in filter(lambda p: p.requires_grad, model.parameters()): param.data = param.data.to(torch.float32) diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py index c48df995..563b1827 100644 --- a/src/llmtuner/model/patcher.py +++ b/src/llmtuner/model/patcher.py @@ -17,7 +17,7 @@ from ..extras.logging import get_logger from ..extras.misc import get_current_device, infer_optim_dtype from ..extras.packages import 
is_flash_attn2_available from ..extras.patches.llama_patch import apply_llama_patch -from .utils import QuantizationMethod, add_z3_leaf_module +from .utils import QuantizationMethod, add_z3_leaf_module, gradient_checkpointing_enable if TYPE_CHECKING: @@ -266,8 +266,9 @@ def _prepare_model_for_training( else: # use_reentrant=False might increase VRAM usage (have not been empirically verified yet) # According to: https://github.com/huggingface/transformers/issues/28339 + model.gradient_checkpointing_enable = MethodType(gradient_checkpointing_enable, model) model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": True}) - model.enable_input_require_grads() + # model.enable_input_require_grads() setattr(model.config, "use_cache", False) # turn off when gradient checkpointing is enabled logger.info("Gradient checkpointing enabled.") diff --git a/src/llmtuner/model/utils.py b/src/llmtuner/model/utils.py index 771e6112..e83a903e 100644 --- a/src/llmtuner/model/utils.py +++ b/src/llmtuner/model/utils.py @@ -135,3 +135,45 @@ def register_autoclass(config: "PretrainedConfig", model: "PreTrainedModel", tok model.__class__.register_for_auto_class() if "AutoTokenizer" in tokenizer.init_kwargs.get("auto_map", {}): tokenizer.__class__.register_for_auto_class() + +def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None): + """ + Modification of the original method to enable gradient checkpointing for block-wise optimizer. + + Activates gradient checkpointing for the current model. + + We pass the `__call__` method of the modules instead of `forward` because `__call__` attaches all the hooks of + the module. https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2 + + Args: + gradient_checkpointing_kwargs (dict, *optional*): + Additional keyword arguments passed along to the `torch.utils.checkpoint.checkpoint` function. + """ + from torch.utils.checkpoint import checkpoint + + if not self.supports_gradient_checkpointing: + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") + + if gradient_checkpointing_kwargs is None: + gradient_checkpointing_kwargs = {} + + # gradient_checkpointing_func = functools.partial(checkpoint, **gradient_checkpointing_kwargs) + + def gradient_checkpointing_func(func, *args, **kwargs): + module = func.__self__ + + if any([p.requires_grad for p in module.parameters()]): + for arg in args: + if torch.is_tensor(arg) and torch.is_floating_point(arg): + arg.requires_grad_(True) + + return checkpoint(func, *args, **kwargs) + + self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func) + + if getattr(self, "_hf_peft_config_loaded", False): + # When using PEFT + gradient checkpointing + Trainer we need to make sure the input has requires_grad=True + # we do it also on PEFT: https://github.com/huggingface/peft/blob/85013987aa82aa1af3da1236b6902556ce3e483e/src/peft/peft_model.py#L334 + # When training with PEFT, only LoRA layers will have requires grad set to True, but the output of frozen layers need to propagate + # the gradients to make sure the gradient flows. 
+ self.enable_input_require_grads() \ No newline at end of file diff --git a/src/llmtuner/train/sft/trainer.py b/src/llmtuner/train/sft/trainer.py index 8d2f9fa0..d750f491 100644 --- a/src/llmtuner/train/sft/trainer.py +++ b/src/llmtuner/train/sft/trainer.py @@ -9,7 +9,8 @@ from transformers import Seq2SeqTrainer from ...extras.constants import IGNORE_INDEX from ...extras.logging import get_logger from ..utils import create_custom_optimzer, create_custom_scheduler - +from types import MethodType +from packaging import version if TYPE_CHECKING: from transformers.trainer import PredictionOutput @@ -28,6 +29,9 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None: super().__init__(**kwargs) self.finetuning_args = finetuning_args + if version.parse(torch.__version__) >= version.parse("1.13"): + from badam import clip_grad_norm_for_sparse_tensor + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) def create_optimizer(self) -> "torch.optim.Optimizer": if self.optimizer is None: diff --git a/src/llmtuner/train/utils.py b/src/llmtuner/train/utils.py index d921aec4..65233f72 100644 --- a/src/llmtuner/train/utils.py +++ b/src/llmtuner/train/utils.py @@ -287,12 +287,69 @@ def _create_loraplus_optimizer( logger.info("Using LoRA+ optimizer with loraplus lr ratio {:.2f}.".format(finetuning_args.loraplus_lr_ratio)) return optimizer +def _create_badam_optimizer( + model: "PreTrainedModel", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments", +) -> "torch.optim.Optimizer": + + from transformers.trainer_pt_utils import get_parameter_names + decay_parameters = list(filter(lambda n: "bias" not in n, get_parameter_names(model, ALL_LAYERNORM_LAYERS))) + # filter out the embedding layers when using badam ratio mode + if finetuning_args.badam_mode == "ratio": + decay_parameters = list(filter(lambda n: "embed" not in n, decay_parameters)) # TODO: make it more general + optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if n in decay_parameters], + "weight_decay": training_args.weight_decay, + }, + { + "params": [p for n, p in model.named_parameters() if n not in decay_parameters], + "weight_decay": 0.0, + }, + ] + + optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args) + + # create BlockOptimizer + if finetuning_args.badam_mode == "layer": + from badam import BlockOptimizer + base_optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) + optimizer = BlockOptimizer(base_optimizer=base_optimizer, + named_parameters_list=list(model.named_parameters()), + block_prefix_list=None, + switch_block_every=finetuning_args.switch_block_every, + start_block=finetuning_args.start_block, + switch_mode=finetuning_args.switch_mode, + verbose=finetuning_args.badam_verbose) + + logger.info(f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.switch_mode}, " + f"switch block every {finetuning_args.switch_block_every} steps, " + f"default start block is {finetuning_args.start_block}") + + elif finetuning_args.badam_mode == "ratio": + assert finetuning_args.badam_update_ratio > 0. 
+ from badam import BlockOptimizerRatio + optimizer = BlockOptimizerRatio(param_groups=optimizer_grouped_parameters, + named_parameters_list=list(model.named_parameters()), + update_ratio=finetuning_args.badam_update_ratio, + mask_mode=finetuning_args.badam_mask_mode, + verbose=finetuning_args.badam_verbose, + **optimizer_kwargs) + + logger.info(f"Using BAdam optimizer with ratio update, update ratio is {finetuning_args.badam_update_ratio}, " + f"mask mode is {finetuning_args.badam_mask_mode}") + + return optimizer def create_custom_optimzer( model: "PreTrainedModel", training_args: "Seq2SeqTrainingArguments", finetuning_args: "FinetuningArguments", ) -> Optional["torch.optim.Optimizer"]: + if finetuning_args.use_badam: + return _create_badam_optimizer(model, training_args, finetuning_args) + if finetuning_args.use_galore: return _create_galore_optimizer(model, training_args, finetuning_args) From 7ecb61822b37f5d71060d696495830ff98edaa06 Mon Sep 17 00:00:00 2001 From: Jonery Date: Tue, 16 Apr 2024 12:05:27 +0800 Subject: [PATCH 02/14] resolve gradient checkpointing issue. --- examples/extras/badam/sft.sh | 3 +-- setup.py | 1 + src/llmtuner/model/utils.py | 16 +++++----------- src/llmtuner/train/sft/trainer.py | 2 +- 4 files changed, 8 insertions(+), 14 deletions(-) diff --git a/examples/extras/badam/sft.sh b/examples/extras/badam/sft.sh index daa63913..656cfdba 100644 --- a/examples/extras/badam/sft.sh +++ b/examples/extras/badam/sft.sh @@ -31,6 +31,5 @@ python ../../../src/train_bash.py \ --use_badam \ --switch_mode descending \ --badam_verbose 2 \ ---switch_block_every 50 \ ---pure_bf16 \ +--switch_block_every 50 diff --git a/setup.py b/setup.py index fd5bdf7e..b2eb4afd 100644 --- a/setup.py +++ b/setup.py @@ -24,6 +24,7 @@ extra_require = { "metrics": ["nltk", "jieba", "rouge-chinese"], "unsloth": ["torch==2.2.0", "unsloth[cu121-ampere-torch220]"], "galore": ["galore-torch"], + "badam": ["torch>=2.1.0"], "vllm": ["vllm>=0.3.3"], "bitsandbytes": ["bitsandbytes>=0.39.0"], "gptq": ["optimum>=1.16.0", "auto-gptq>=0.5.0"], diff --git a/src/llmtuner/model/utils.py b/src/llmtuner/model/utils.py index e83a903e..fd587efd 100644 --- a/src/llmtuner/model/utils.py +++ b/src/llmtuner/model/utils.py @@ -150,30 +150,24 @@ def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None): Additional keyword arguments passed along to the `torch.utils.checkpoint.checkpoint` function. 
""" from torch.utils.checkpoint import checkpoint + import functools if not self.supports_gradient_checkpointing: raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") if gradient_checkpointing_kwargs is None: - gradient_checkpointing_kwargs = {} + gradient_checkpointing_kwargs = {"use_reentrant": True} - # gradient_checkpointing_func = functools.partial(checkpoint, **gradient_checkpointing_kwargs) + checkpoint = functools.partial(checkpoint, **gradient_checkpointing_kwargs) def gradient_checkpointing_func(func, *args, **kwargs): module = func.__self__ - if any([p.requires_grad for p in module.parameters()]): + if any(p.requires_grad for p in module.parameters()): for arg in args: if torch.is_tensor(arg) and torch.is_floating_point(arg): arg.requires_grad_(True) return checkpoint(func, *args, **kwargs) - self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func) - - if getattr(self, "_hf_peft_config_loaded", False): - # When using PEFT + gradient checkpointing + Trainer we need to make sure the input has requires_grad=True - # we do it also on PEFT: https://github.com/huggingface/peft/blob/85013987aa82aa1af3da1236b6902556ce3e483e/src/peft/peft_model.py#L334 - # When training with PEFT, only LoRA layers will have requires grad set to True, but the output of frozen layers need to propagate - # the gradients to make sure the gradient flows. - self.enable_input_require_grads() \ No newline at end of file + self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func) \ No newline at end of file diff --git a/src/llmtuner/train/sft/trainer.py b/src/llmtuner/train/sft/trainer.py index d750f491..de741426 100644 --- a/src/llmtuner/train/sft/trainer.py +++ b/src/llmtuner/train/sft/trainer.py @@ -29,7 +29,7 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None: super().__init__(**kwargs) self.finetuning_args = finetuning_args - if version.parse(torch.__version__) >= version.parse("1.13"): + if finetuning_args.use_badam: from badam import clip_grad_norm_for_sparse_tensor self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) From 86900210d32e0d4cd9c6fb411a3692178411e970 Mon Sep 17 00:00:00 2001 From: Jonery Date: Tue, 16 Apr 2024 12:25:50 +0800 Subject: [PATCH 03/14] remove badam from core requirements --- requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9d58d75a..4b5651b4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,5 +14,4 @@ pydantic fastapi sse-starlette matplotlib -fire -badam \ No newline at end of file +fire \ No newline at end of file From f4b4a26c6435aa7fdccd11fb92795b8b26dd989a Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 16 Apr 2024 17:10:02 +0800 Subject: [PATCH 04/14] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b2eb4afd..9ef881e2 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ extra_require = { "metrics": ["nltk", "jieba", "rouge-chinese"], "unsloth": ["torch==2.2.0", "unsloth[cu121-ampere-torch220]"], "galore": ["galore-torch"], - "badam": ["torch>=2.1.0"], + "badam": ["badam"], "vllm": ["vllm>=0.3.3"], "bitsandbytes": ["bitsandbytes>=0.39.0"], "gptq": ["optimum>=1.16.0", "auto-gptq>=0.5.0"], From 50ced257e0a1737dbd73d569f8f706ff452f70aa Mon Sep 17 00:00:00 2001 From: 
hoshi-hiyouga Date: Tue, 16 Apr 2024 17:10:17 +0800 Subject: [PATCH 05/14] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4b5651b4..1fa5a142 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,4 @@ pydantic fastapi sse-starlette matplotlib -fire \ No newline at end of file +fire From 57dcd91e17833a0eeb8d99af92ac73c132a77648 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 16 Apr 2024 17:25:40 +0800 Subject: [PATCH 06/14] Update sft.sh --- examples/extras/badam/sft.sh | 68 ++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/examples/extras/badam/sft.sh b/examples/extras/badam/sft.sh index 656cfdba..c2319caa 100644 --- a/examples/extras/badam/sft.sh +++ b/examples/extras/badam/sft.sh @@ -1,35 +1,35 @@ -# BAdam layer-wise -export CUDA_VISIBLE_DEVICES=0 -export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True -python ../../../src/train_bash.py \ ---stage sft \ ---do_train \ ---model_name_or_path meta-llama/Llama-2-7b-hf \ ---dataset alpaca_gpt4_en,glaive_toolcall \ ---dataset_dir ../../../data \ ---template default \ ---finetuning_type full \ ---output_dir ../../../saves/LLaMA2-7B/badam \ ---overwrite_cache \ ---overwrite_output_dir \ ---cutoff_len 1024 \ ---preprocessing_num_workers 32 \ ---per_device_train_batch_size 8 \ ---per_device_eval_batch_size 5 \ ---gradient_accumulation_steps 2 \ ---lr_scheduler_type cosine \ ---logging_steps 10 \ ---warmup_steps 20 \ ---save_steps 100 \ ---eval_steps 100 \ ---evaluation_strategy steps \ ---load_best_model_at_end \ ---learning_rate 5e-5 \ ---num_train_epochs 3.0 \ ---val_size 0.1 \ ---plot_loss \ ---use_badam \ ---switch_mode descending \ ---badam_verbose 2 \ ---switch_block_every 50 +#!/bin/bash +CUDA_VISIBLE_DEVICES=0 python ../../../src/train_bash.py \ + --stage sft \ + --do_train \ + --model_name_or_path meta-llama/Llama-2-7b-hf \ + --dataset alpaca_gpt4_en,glaive_toolcall \ + --dataset_dir ../../../data \ + --template default \ + --finetuning_type full \ + --use_badam \ + --badam_switch_mode descending \ + --badam_switch_block_every 50 \ + --badam_verbose 2 \ + --output_dir ../../../saves/LLaMA2-7B/badam/sft \ + --overwrite_cache \ + --overwrite_output_dir \ + --cutoff_len 1024 \ + --preprocessing_num_workers 16 \ + --per_device_train_batch_size 1 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --lr_scheduler_type cosine \ + --logging_steps 10 \ + --warmup_steps 20 \ + --save_steps 100 \ + --eval_steps 100 \ + --evaluation_strategy steps \ + --load_best_model_at_end \ + --learning_rate 5e-5 \ + --num_train_epochs 3.0 \ + --max_samples 3000 \ + --val_size 0.1 \ + --plot_loss \ + --pure_bf16 From ec899cccf3b8710510e496a3cd8e4c302bb99a19 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 16 Apr 2024 17:26:30 +0800 Subject: [PATCH 07/14] Update finetuning_args.py --- src/llmtuner/hparams/finetuning_args.py | 90 +++++++++++++------------ 1 file changed, 48 insertions(+), 42 deletions(-) diff --git a/src/llmtuner/hparams/finetuning_args.py b/src/llmtuner/hparams/finetuning_args.py index d64f1583..899c7284 100644 --- a/src/llmtuner/hparams/finetuning_args.py +++ b/src/llmtuner/hparams/finetuning_args.py @@ -163,47 +163,6 @@ class RLHFArguments: metadata={"help": "The type of the reward model in PPO training. Lora model only supports lora training."}, ) -@dataclass -class BAdamArgument: - r""" - Arguments for BAdam optimizer. 
- """ - use_badam: bool = field( - default=False, - metadata={"help": "Whether or not to use BAdam optimizer."}, - ) - badam_mode: Literal["layer", "ratio"] = field( - default="layer", - metadata={"help": "The mode of BAdam optimizer. 'layer' for layer-wise, 'ratio' for ratio-wise."}, - ) - - # ======== Arguments for layer-wise update ======== - start_block: Optional[int] = field( - default=None, - metadata={"help": "The starting block index for block-wise fine-tuning."} - ) - switch_block_every: Optional[int] = field( - default=50, - metadata={"help": "how often to switch model's block update. Set to -1 to disable the block update."} - ) - switch_mode: Optional[Literal["ascending", "descending", "random", "fixed"]] = field( - default="ascending", - metadata={"help": "the strategy of picking block to update."} - ) - - # ======== Arguments for ratio-wise update ======== - badam_update_ratio: float = field( - default=0., - metadata={"help": "The ratio of the update for the BAdam optimizer."} - ) - badam_mask_mode: Literal["adjacent", "scatter"] = field( - default="adjacent", - metadata={"help": "The mode of the mask for BAdam optimizer. `adjacent` means that the trainable parameters are adjacent to each other; `scatter` means that trainable parameters are randomly choosed from the weight."} - ) - badam_verbose: int = field( - default=0, - metadata={"help": "The verbosity level of BAdam optimizer. 0 for no print, 1 for print the block prefix, 2 for print trainable parameters"} - ) @dataclass class GaloreArguments: @@ -213,7 +172,7 @@ class GaloreArguments: use_galore: bool = field( default=False, - metadata={"help": "Whether or not to use gradient low-Rank projection."}, + metadata={"help": "Whether or not to use the gradient low-Rank projection (GaLore)."}, ) galore_target: str = field( default="all", @@ -244,6 +203,53 @@ class GaloreArguments: ) +@dataclass +class BAdamArgument: + r""" + Arguments pertaining to the BAdam optimizer. + """ + + use_badam: bool = field( + default=False, + metadata={"help": "Whether or not to use the BAdam optimizer."}, + ) + badam_mode: Literal["layer", "ratio"] = field( + default="layer", + metadata={"help": "Whether to use layer-wise or ratio-wise BAdam optimizer."}, + ) + badam_start_block: Optional[int] = field( + default=None, + metadata={"help": "The starting block index for layer-wise BAdam."}, + ) + badam_switch_block_every: Optional[int] = field( + default=50, + metadata={"help": "How often to switch model's block update. Set to -1 to disable the block update."}, + ) + badam_switch_mode: Optional[Literal["ascending", "descending", "random", "fixed"]] = field( + default="ascending", + metadata={"help": "the strategy of picking block to update for layer-wise BAdam."}, + ) + badam_update_ratio: float = field( + default=0.0, + metadata={"help": "The ratio of the update for ratio-wise BAdam."}, + ) + badam_mask_mode: Literal["adjacent", "scatter"] = field( + default="adjacent", + metadata={ + "help": """The mode of the mask for BAdam optimizer. \ + `adjacent` means that the trainable parameters are adjacent to each other, \ + `scatter` means that trainable parameters are randomly choosed from the weight.""" + }, + ) + badam_verbose: int = field( + default=0, + metadata={ + "help": """The verbosity level of BAdam optimizer. 
\ + 0 for no print, 1 for print the block prefix, 2 for print trainable parameters""" + }, + ) + + @dataclass class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreArguments, BAdamArgument): r""" From 5b59ff421204115ced405a2e3d56ac0ee8c5b788 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 16 Apr 2024 17:27:02 +0800 Subject: [PATCH 08/14] Update parser.py --- src/llmtuner/hparams/parser.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py index 032a1a4b..baa65978 100644 --- a/src/llmtuner/hparams/parser.py +++ b/src/llmtuner/hparams/parser.py @@ -82,12 +82,18 @@ def _check_extra_dependencies( if model_args.use_unsloth: require_version("unsloth", "Please install unsloth: https://github.com/unslothai/unsloth") + if model_args.mixture_of_depths: + require_version("mixture-of-depth", "To fix: pip install mixture-of-depth") + if model_args.infer_backend == "vllm": require_version("vllm>=0.3.3", "To fix: pip install vllm>=0.3.3") if finetuning_args.use_galore: require_version("galore_torch", "To fix: pip install galore_torch") + if finetuning_args.use_badam: + require_version("badam", "To fix: pip install badam") + if training_args is not None and training_args.predict_with_generate: require_version("jieba", "To fix: pip install jieba") require_version("nltk", "To fix: pip install nltk") @@ -151,6 +157,9 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if training_args.do_train and training_args.predict_with_generate: raise ValueError("`predict_with_generate` cannot be set as True while training.") + if training_args.do_train and model_args.quantization_device_map == "auto": + raise ValueError("Cannot use device map for quantized models in training.") + if finetuning_args.use_dora and model_args.use_unsloth: raise ValueError("Unsloth does not support DoRA.") @@ -169,14 +178,15 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: raise ValueError("Distributed training does not support layer-wise GaLore.") if finetuning_args.use_galore and training_args.deepspeed is not None: - raise ValueError("GaLore is incompatible with DeepSpeed.") + raise ValueError("GaLore is incompatible with DeepSpeed yet.") - if (finetuning_args.use_badam + if ( + finetuning_args.use_badam and finetuning_args.badam_mode == "layer" and training_args.parallel_mode.value == "distributed" ): - raise ValueError("BAdam with layer-wise mode is not supported in distributed training by now, use ratio mode instead.") - + raise ValueError("Layer-wise BAdam does not yet support distributed training, use ratio-wise BAdam.") + if model_args.infer_backend == "vllm": raise ValueError("vLLM backend is only available for API, CLI and Web.") From 4660703674233949a8ba8c76bdb17dafc9d620d4 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 16 Apr 2024 17:27:25 +0800 Subject: [PATCH 09/14] Update parser.py --- src/llmtuner/hparams/parser.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py index baa65978..a9f8ffd7 100644 --- a/src/llmtuner/hparams/parser.py +++ b/src/llmtuner/hparams/parser.py @@ -82,9 +82,6 @@ def _check_extra_dependencies( if model_args.use_unsloth: require_version("unsloth", "Please install unsloth: https://github.com/unslothai/unsloth") - if model_args.mixture_of_depths: - require_version("mixture-of-depth", "To fix: pip install mixture-of-depth") - if 
model_args.infer_backend == "vllm": require_version("vllm>=0.3.3", "To fix: pip install vllm>=0.3.3") From 750cdf2e74097c8775d03ddf55646cc14d4a686f Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 16 Apr 2024 17:28:12 +0800 Subject: [PATCH 10/14] Update adapter.py --- src/llmtuner/model/adapter.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py index 71c9c2f4..b712bdcf 100644 --- a/src/llmtuner/model/adapter.py +++ b/src/llmtuner/model/adapter.py @@ -145,18 +145,22 @@ def init_adapter( "lora_alpha": finetuning_args.lora_alpha, "lora_dropout": finetuning_args.lora_dropout, "use_rslora": finetuning_args.use_rslora, + "modules_to_save": finetuning_args.additional_target, } if model_args.use_unsloth: from unsloth import FastLanguageModel # type: ignore - unsloth_peft_kwargs = {"model": model, "max_seq_length": model_args.model_max_length} + unsloth_peft_kwargs = { + "model": model, + "max_seq_length": model_args.model_max_length, + "use_gradient_checkpointing": "unsloth", + } model = FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs) else: lora_config = LoraConfig( task_type=TaskType.CAUSAL_LM, inference_mode=False, - modules_to_save=finetuning_args.additional_target, use_dora=finetuning_args.use_dora, **peft_kwargs, ) From a950f3b81de701f5f23ce3efa60ff0382bb40dfe Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 16 Apr 2024 17:29:19 +0800 Subject: [PATCH 11/14] Update patcher.py --- src/llmtuner/model/patcher.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py index 563b1827..fb2835e8 100644 --- a/src/llmtuner/model/patcher.py +++ b/src/llmtuner/model/patcher.py @@ -133,7 +133,9 @@ def _configure_quantization( if is_deepspeed_zero3_enabled(): raise ValueError("DeepSpeed ZeRO-3 is incompatible with quantized models.") - init_kwargs["device_map"] = {"": get_current_device()} + if model_args.quantization_device_map != "auto": + init_kwargs["device_map"] = {"": get_current_device()} + quantization_config: Dict[str, Any] = getattr(config, "quantization_config", None) quant_method = quantization_config.get("quant_method", "") @@ -268,7 +270,6 @@ def _prepare_model_for_training( # According to: https://github.com/huggingface/transformers/issues/28339 model.gradient_checkpointing_enable = MethodType(gradient_checkpointing_enable, model) model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": True}) - # model.enable_input_require_grads() setattr(model.config, "use_cache", False) # turn off when gradient checkpointing is enabled logger.info("Gradient checkpointing enabled.") From 38a56706e0f52297501d351d38b51bee73e881dc Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 16 Apr 2024 17:29:30 +0800 Subject: [PATCH 12/14] Update utils.py --- src/llmtuner/model/utils.py | 70 +++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/src/llmtuner/model/utils.py b/src/llmtuner/model/utils.py index fd587efd..7e4430d1 100644 --- a/src/llmtuner/model/utils.py +++ b/src/llmtuner/model/utils.py @@ -1,5 +1,6 @@ from enum import Enum, unique -from typing import TYPE_CHECKING, Dict, List +from functools import partial +from typing import TYPE_CHECKING, Any, Dict, List, Optional import torch from transformers import PreTrainedModel @@ -100,6 +101,37 @@ def find_expanded_modules(model: "PreTrainedModel", target_modules: List[str], n return 
module_names +def gradient_checkpointing_enable( + self: "PreTrainedModel", gradient_checkpointing_kwargs: Optional[Dict[str, Any]] = None +) -> None: + r""" + Activates gradient checkpointing for the current model. + + Modification of the original method to enable gradient checkpointing for block-wise optimizer. + """ + from torch.utils.checkpoint import checkpoint + + if not self.supports_gradient_checkpointing: + raise ValueError("{} does not support gradient checkpointing.".format(self.__class__.__name__)) + + if gradient_checkpointing_kwargs is None: + gradient_checkpointing_kwargs = {"use_reentrant": True} + + gradient_checkpointing_func = partial(checkpoint, **gradient_checkpointing_kwargs) + + def custom_gradient_checkpointing_func(func, *args, **kwargs): + module: "torch.nn.Module" = func.__self__ + + if any(param.requires_grad for param in module.parameters()): + for arg in args: + if torch.is_tensor(arg) and torch.is_floating_point(arg): + arg.requires_grad_(True) + + return gradient_checkpointing_func(func, *args, **kwargs) + + self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=custom_gradient_checkpointing_func) + + def load_valuehead_params(path_or_repo_id: str, model_args: "ModelArguments") -> Dict[str, torch.Tensor]: r""" Loads value head parameters from Hugging Face Hub or local disk. @@ -135,39 +167,3 @@ def register_autoclass(config: "PretrainedConfig", model: "PreTrainedModel", tok model.__class__.register_for_auto_class() if "AutoTokenizer" in tokenizer.init_kwargs.get("auto_map", {}): tokenizer.__class__.register_for_auto_class() - -def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None): - """ - Modification of the original method to enable gradient checkpointing for block-wise optimizer. - - Activates gradient checkpointing for the current model. - - We pass the `__call__` method of the modules instead of `forward` because `__call__` attaches all the hooks of - the module. https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2 - - Args: - gradient_checkpointing_kwargs (dict, *optional*): - Additional keyword arguments passed along to the `torch.utils.checkpoint.checkpoint` function. 
- """ - from torch.utils.checkpoint import checkpoint - import functools - - if not self.supports_gradient_checkpointing: - raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") - - if gradient_checkpointing_kwargs is None: - gradient_checkpointing_kwargs = {"use_reentrant": True} - - checkpoint = functools.partial(checkpoint, **gradient_checkpointing_kwargs) - - def gradient_checkpointing_func(func, *args, **kwargs): - module = func.__self__ - - if any(p.requires_grad for p in module.parameters()): - for arg in args: - if torch.is_tensor(arg) and torch.is_floating_point(arg): - arg.requires_grad_(True) - - return checkpoint(func, *args, **kwargs) - - self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func) \ No newline at end of file From 6700a1b9fa0cbd965ac45d3f2de1088727235c25 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 16 Apr 2024 17:29:52 +0800 Subject: [PATCH 13/14] Update trainer.py --- src/llmtuner/train/sft/trainer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llmtuner/train/sft/trainer.py b/src/llmtuner/train/sft/trainer.py index de741426..def427fd 100644 --- a/src/llmtuner/train/sft/trainer.py +++ b/src/llmtuner/train/sft/trainer.py @@ -1,5 +1,6 @@ import json import os +from types import MethodType from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import numpy as np @@ -9,8 +10,7 @@ from transformers import Seq2SeqTrainer from ...extras.constants import IGNORE_INDEX from ...extras.logging import get_logger from ..utils import create_custom_optimzer, create_custom_scheduler -from types import MethodType -from packaging import version + if TYPE_CHECKING: from transformers.trainer import PredictionOutput @@ -31,6 +31,7 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer): self.finetuning_args = finetuning_args if finetuning_args.use_badam: from badam import clip_grad_norm_for_sparse_tensor + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) def create_optimizer(self) -> "torch.optim.Optimizer": From c9828f4c6e6c150c884e02d0213dff0c09801e77 Mon Sep 17 00:00:00 2001 From: hoshi-hiyouga Date: Tue, 16 Apr 2024 17:30:12 +0800 Subject: [PATCH 14/14] Update utils.py --- src/llmtuner/train/utils.py | 111 +++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 47 deletions(-) diff --git a/src/llmtuner/train/utils.py b/src/llmtuner/train/utils.py index 65233f72..2835eddf 100644 --- a/src/llmtuner/train/utils.py +++ b/src/llmtuner/train/utils.py @@ -162,6 +162,15 @@ def _get_decay_parameter_names(model: "PreTrainedModel") -> List[str]: return decay_parameters +def _get_embedding_names(model: "PreTrainedModel") -> List[str]: + r""" + Returns a list of names of parameters in embedding. 
+ """ + result = {name for name, _ in model.get_input_embeddings().named_parameters()} + result.update(name for name, _ in model.get_output_embeddings().named_parameters()) + return result + + def _create_galore_optimizer( model: "PreTrainedModel", training_args: "Seq2SeqTrainingArguments", @@ -236,7 +245,7 @@ def _create_galore_optimizer( optimizer = DummyOptimizer(lr=training_args.learning_rate, optimizer_dict=optimizer_dict) else: param_groups = [ - dict(params=nodecay_params), + dict(params=nodecay_params, weight_decay=0.0), dict(params=decay_params, weight_decay=training_args.weight_decay), dict(params=galore_params, weight_decay=training_args.weight_decay, **galore_kwargs), ] @@ -280,82 +289,90 @@ def _create_loraplus_optimizer( param_groups = [ dict(params=param_dict["lora_a"], **decay_args), dict(params=param_dict["lora_b"], lr=loraplus_lr, **decay_args), - dict(params=param_dict["lora_b_nodecay"], lr=loraplus_lr), + dict(params=param_dict["lora_b_nodecay"], lr=loraplus_lr, weight_decay=0.0), dict(params=param_dict["embedding"], lr=finetuning_args.loraplus_lr_embedding, **decay_args), ] optimizer = optim_class(param_groups, **optim_kwargs) logger.info("Using LoRA+ optimizer with loraplus lr ratio {:.2f}.".format(finetuning_args.loraplus_lr_ratio)) return optimizer + def _create_badam_optimizer( model: "PreTrainedModel", training_args: "Seq2SeqTrainingArguments", finetuning_args: "FinetuningArguments", ) -> "torch.optim.Optimizer": - - from transformers.trainer_pt_utils import get_parameter_names - decay_parameters = list(filter(lambda n: "bias" not in n, get_parameter_names(model, ALL_LAYERNORM_LAYERS))) - # filter out the embedding layers when using badam ratio mode - if finetuning_args.badam_mode == "ratio": - decay_parameters = list(filter(lambda n: "embed" not in n, decay_parameters)) # TODO: make it more general - optimizer_grouped_parameters = [ - { - "params": [p for n, p in model.named_parameters() if n in decay_parameters], - "weight_decay": training_args.weight_decay, - }, - { - "params": [p for n, p in model.named_parameters() if n not in decay_parameters], - "weight_decay": 0.0, - }, + decay_param_names = _get_decay_parameter_names(model) + if finetuning_args.badam_mode == "ratio": # filter out the embedding layers for ratio-wise badam + decay_param_names = [name for name in decay_param_names if name not in _get_embedding_names(model)] + + decay_params, nodecay_params = [], [] + for name, param in model.named_parameters(): + if param.requires_grad: + if name in decay_param_names: + decay_params.append(param) + else: + nodecay_params.append(param) + + optim_class, optim_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args) + param_groups = [ + dict(params=nodecay_params, weight_decay=0.0), + dict(params=decay_params, weight_decay=training_args.weight_decay), ] - optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args) - - # create BlockOptimizer if finetuning_args.badam_mode == "layer": from badam import BlockOptimizer - base_optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) - optimizer = BlockOptimizer(base_optimizer=base_optimizer, - named_parameters_list=list(model.named_parameters()), - block_prefix_list=None, - switch_block_every=finetuning_args.switch_block_every, - start_block=finetuning_args.start_block, - switch_mode=finetuning_args.switch_mode, - verbose=finetuning_args.badam_verbose) - - logger.info(f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.switch_mode}, " - 
f"switch block every {finetuning_args.switch_block_every} steps, " - f"default start block is {finetuning_args.start_block}") - + + base_optimizer = optim_class(param_groups, **optim_kwargs) + optimizer = BlockOptimizer( + base_optimizer=base_optimizer, + named_parameters_list=list(model.named_parameters()), + block_prefix_list=None, + switch_block_every=finetuning_args.badam_switch_block_every, + start_block=finetuning_args.badam_start_block, + switch_mode=finetuning_args.badam_switch_mode, + verbose=finetuning_args.badam_verbose, + ) + logger.info( + f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, " + f"switch block every {finetuning_args.badam_switch_block_every} steps, " + f"default start block is {finetuning_args.badam_start_block}" + ) + elif finetuning_args.badam_mode == "ratio": - assert finetuning_args.badam_update_ratio > 0. from badam import BlockOptimizerRatio - optimizer = BlockOptimizerRatio(param_groups=optimizer_grouped_parameters, - named_parameters_list=list(model.named_parameters()), - update_ratio=finetuning_args.badam_update_ratio, - mask_mode=finetuning_args.badam_mask_mode, - verbose=finetuning_args.badam_verbose, - **optimizer_kwargs) - - logger.info(f"Using BAdam optimizer with ratio update, update ratio is {finetuning_args.badam_update_ratio}, " - f"mask mode is {finetuning_args.badam_mask_mode}") - + + assert finetuning_args.badam_update_ratio > 1e-6 + optimizer = BlockOptimizerRatio( + param_groups=param_groups, + named_parameters_list=list(model.named_parameters()), + update_ratio=finetuning_args.badam_update_ratio, + mask_mode=finetuning_args.badam_mask_mode, + verbose=finetuning_args.badam_verbose, + **optim_kwargs, + ) + logger.info( + f"Using BAdam optimizer with ratio-wise update, update ratio is {finetuning_args.badam_update_ratio}, " + f"mask mode is {finetuning_args.badam_mask_mode}" + ) + return optimizer + def create_custom_optimzer( model: "PreTrainedModel", training_args: "Seq2SeqTrainingArguments", finetuning_args: "FinetuningArguments", ) -> Optional["torch.optim.Optimizer"]: - if finetuning_args.use_badam: - return _create_badam_optimizer(model, training_args, finetuning_args) - if finetuning_args.use_galore: return _create_galore_optimizer(model, training_args, finetuning_args) if finetuning_args.loraplus_lr_ratio is not None: return _create_loraplus_optimizer(model, training_args, finetuning_args) + if finetuning_args.use_badam: + return _create_badam_optimizer(model, training_args, finetuning_args) + def create_custom_scheduler( training_args: "Seq2SeqTrainingArguments",