diff --git a/.gitignore b/.gitignore
index 0355c666..2486e728 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,3 +163,5 @@ cython_debug/
 user.config
 saves/
 cache/
+wandb
+ds_badam_exp
\ No newline at end of file
diff --git a/examples/extras/badam/llama3_badam_sft.yaml b/examples/extras/badam/llama3_badam_sft.yaml
new file mode 100644
index 00000000..f5adb220
--- /dev/null
+++ b/examples/extras/badam/llama3_badam_sft.yaml
@@ -0,0 +1,40 @@
+### model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+### method
+stage: sft
+do_train: true
+finetuning_type: full
+use_badam: true
+badam_switch_mode: ascending
+badam_switch_interval: 50
+badam_verbose: 2
+
+### dataset
+dataset: identity,alpaca_en_demo
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+### output
+output_dir: saves/llama3-8b/full/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+### train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 8
+learning_rate: 1.0e-6
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+
+### eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+eval_strategy: steps
+eval_steps: 500
diff --git a/examples/extras/badam/train_single_gpu.sh b/examples/extras/badam/train_single_gpu.sh
new file mode 100644
index 00000000..8af79007
--- /dev/null
+++ b/examples/extras/badam/train_single_gpu.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+export CUDA_VISIBLE_DEVICES=0
+
+cd ../../..
+
+llamafactory-cli train \
+    --stage sft \
+    --do_train True \
+    --model_name_or_path meta-llama/Llama-2-13b-hf \
+    --preprocessing_num_workers 16 \
+    --finetuning_type full \
+    --template default \
+    --flash_attn auto \
+    --dataset_dir data \
+    --dataset alpaca_en_demo \
+    --cutoff_len 1024 \
+    --learning_rate 1e-6 \
+    --num_train_epochs 3.0 \
+    --max_samples 100000 \
+    --per_device_train_batch_size 1 \
+    --gradient_accumulation_steps 8 \
+    --lr_scheduler_type cosine \
+    --max_grad_norm 1.0 \
+    --logging_steps 5 \
+    --save_steps 100 \
+    --warmup_steps 0 \
+    --optim adamw_torch \
+    --packing False \
+    --report_to none \
+    --use_badam True \
+    --output_dir saves/LLaMA2-13B/full/BAdam \
+    --plot_loss True \
+    --ddp_timeout 180000000 \
+    --include_num_input_tokens_seen True \
+    --badam_mode layer \
+    --badam_switch_mode ascending \
+    --badam_switch_interval 50
\ No newline at end of file
diff --git a/examples/extras/badam/train_zero3.sh b/examples/extras/badam/train_zero3.sh
new file mode 100644
index 00000000..3b182134
--- /dev/null
+++ b/examples/extras/badam/train_zero3.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+cd ../../..
+
+llamafactory-cli train \
+    --stage sft \
+    --do_train True \
+    --model_name_or_path meta-llama/Llama-2-13b-hf \
+    --preprocessing_num_workers 16 \
+    --finetuning_type full \
+    --template default \
+    --flash_attn auto \
+    --dataset_dir data \
+    --dataset alpaca_en_demo \
+    --cutoff_len 1024 \
+    --learning_rate 1e-6 \
+    --num_train_epochs 3.0 \
+    --max_samples 100000 \
+    --per_device_train_batch_size 8 \
+    --gradient_accumulation_steps 2 \
+    --lr_scheduler_type cosine \
+    --max_grad_norm 1.0 \
+    --logging_steps 5 \
+    --save_steps 100 \
+    --warmup_steps 0 \
+    --optim adamw_torch \
+    --packing False \
+    --report_to none \
+    --use_badam True \
+    --output_dir saves/LLaMA2-13B/full/BAdam \
+    --fp16 True \
+    --plot_loss True \
+    --ddp_timeout 180000000 \
+    --include_num_input_tokens_seen True \
+    --badam_mode layer \
+    --badam_switch_mode ascending \
+    --badam_switch_interval 50 \
+    --deepspeed cache/ds_z3_config.json
\ No newline at end of file
diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py
index a593bf45..f2ccd5e6 100644
--- a/src/llamafactory/hparams/parser.py
+++ b/src/llamafactory/hparams/parser.py
@@ -214,13 +214,15 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
 
     if (
         finetuning_args.use_badam
-        and finetuning_args.badam_mode == "layer"
-        and training_args.parallel_mode == ParallelMode.DISTRIBUTED
+        and training_args.parallel_mode == ParallelMode.DISTRIBUTED
     ):
-        raise ValueError("Layer-wise BAdam does not yet support distributed training, use ratio-wise BAdam.")
+        if finetuning_args.badam_mode == "ratio":
+            raise ValueError("Ratio-wise BAdam does not yet support distributed training, use layer-wise BAdam instead (--badam_mode layer).")
+        if finetuning_args.badam_mode == "layer" and not is_deepspeed_zero3_enabled():
+            raise ValueError("Layer-wise BAdam only supports distributed training with DeepSpeed ZeRO-3.")
 
-    if (finetuning_args.use_galore or finetuning_args.use_badam) and training_args.deepspeed is not None:
-        raise ValueError("GaLore and BAdam are incompatible with DeepSpeed yet.")
+    if finetuning_args.use_galore and training_args.deepspeed is not None:
+        raise ValueError("GaLore is incompatible with DeepSpeed yet.")
 
     if model_args.infer_backend == "vllm":
         raise ValueError("vLLM backend is only available for API, CLI and Web.")
diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py
index 9928d0bc..a3e0e961 100644
--- a/src/llamafactory/train/dpo/trainer.py
+++ b/src/llamafactory/train/dpo/trainer.py
@@ -96,9 +96,9 @@ class CustomDPOTrainer(DPOTrainer):
             self.save_model(os.path.join(self.args.output_dir, "pissa_init"))
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_for_sparse_tensor
-
-            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+            from badam import clip_grad_norm_old_version, BAdamCallback
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
+            self.callback_handler.add_callback(BAdamCallback)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py
index 91d68975..0d50987f 100644
--- a/src/llamafactory/train/kto/trainer.py
+++ b/src/llamafactory/train/kto/trainer.py
@@ -91,9 +91,9 @@ class CustomKTOTrainer(KTOTrainer):
             self.ref_model.eval()
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_for_sparse_tensor
-
-            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+            from badam import clip_grad_norm_old_version, BAdamCallback
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
+            self.callback_handler.add_callback(BAdamCallback)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
diff --git a/src/llamafactory/train/ppo/trainer.py b/src/llamafactory/train/ppo/trainer.py
index 38f4c6c8..2d5d7ffc 100644
--- a/src/llamafactory/train/ppo/trainer.py
+++ b/src/llamafactory/train/ppo/trainer.py
@@ -166,9 +166,9 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
             self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True)
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_for_sparse_tensor
-
-            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+            from badam import clip_grad_norm_old_version, BAdamCallback
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
+            self.callback_handler.add_callback(BAdamCallback)
 
     def ppo_train(self, resume_from_checkpoint: Optional[str] = None) -> None:
         r"""
diff --git a/src/llamafactory/train/pt/trainer.py b/src/llamafactory/train/pt/trainer.py
index f9e04cb5..d3516b41 100644
--- a/src/llamafactory/train/pt/trainer.py
+++ b/src/llamafactory/train/pt/trainer.py
@@ -48,9 +48,9 @@ class CustomTrainer(Trainer):
             self.save_model(os.path.join(self.args.output_dir, "pissa_init"))
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_for_sparse_tensor
-
-            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+            from badam import clip_grad_norm_old_version, BAdamCallback
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
+            self.callback_handler.add_callback(BAdamCallback)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
diff --git a/src/llamafactory/train/rm/trainer.py b/src/llamafactory/train/rm/trainer.py
index 7f91e5f5..433251cf 100644
--- a/src/llamafactory/train/rm/trainer.py
+++ b/src/llamafactory/train/rm/trainer.py
@@ -72,9 +72,9 @@ class PairwiseTrainer(Trainer):
         self.processor = processor
         self.can_return_loss = True  # override property to return eval_loss
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_for_sparse_tensor
-
-            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+            from badam import clip_grad_norm_old_version, BAdamCallback
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
+            self.callback_handler.add_callback(BAdamCallback)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
diff --git a/src/llamafactory/train/sft/trainer.py b/src/llamafactory/train/sft/trainer.py
index 921e49ab..45799b96 100644
--- a/src/llamafactory/train/sft/trainer.py
+++ b/src/llamafactory/train/sft/trainer.py
@@ -56,9 +56,9 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
             self.save_model(os.path.join(self.args.output_dir, "pissa_init"))
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_for_sparse_tensor
-
-            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator)
+            from badam import clip_grad_norm_old_version, BAdamCallback
+            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
+            self.callback_handler.add_callback(BAdamCallback)
 
     def create_optimizer(self) -> "torch.optim.Optimizer":
         if self.optimizer is None:
diff --git a/src/llamafactory/train/trainer_utils.py b/src/llamafactory/train/trainer_utils.py
index c1b90155..0206dcb6 100644
--- a/src/llamafactory/train/trainer_utils.py
+++ b/src/llamafactory/train/trainer_utils.py
@@ -372,6 +372,9 @@ def _create_badam_optimizer(
         dict(params=decay_params, weight_decay=training_args.weight_decay),
     ]
 
+    from transformers.integrations import is_deepspeed_zero3_enabled
+    ds_zero3_enabled = is_deepspeed_zero3_enabled()
+
     if finetuning_args.badam_mode == "layer":
         from badam import BlockOptimizer
 
@@ -384,6 +387,7 @@
             start_block=finetuning_args.badam_start_block,
             switch_mode=finetuning_args.badam_switch_mode,
             verbose=finetuning_args.badam_verbose,
+            ds_zero3_enabled=ds_zero3_enabled,
         )
         logger.info(
             f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, "
@@ -394,6 +398,7 @@
     elif finetuning_args.badam_mode == "ratio":
         from badam import BlockOptimizerRatio
 
+        assert not ds_zero3_enabled, "BAdam with ratio-based update does not support DeepSpeed ZeRO-3 yet, use layer-wise update instead: --badam_mode layer."
         assert finetuning_args.badam_update_ratio > 1e-6
         optimizer = BlockOptimizerRatio(
             param_groups=param_groups,
@@ -405,7 +410,7 @@
             **optim_kwargs,
         )
         logger.info(
-            f"Using BAdam optimizer with ratio-wise update, update ratio is {finetuning_args.badam_update_ratio}, "
+            f"Using BAdam optimizer with ratio-based update, update ratio is {finetuning_args.badam_update_ratio}, "
            f"mask mode is {finetuning_args.badam_mask_mode}"
        )
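
Note: train_zero3.sh passes --deepspeed cache/ds_z3_config.json, but that config file is not part of this diff. The shell snippet below is a minimal sketch of how such a ZeRO-3 config could be created, assuming the usual "auto" placeholders that the HF Trainer resolves at runtime; the exact file used with this PR may differ.

# Assumption: illustrative DeepSpeed ZeRO-3 config, not the file shipped with this PR.
mkdir -p cache
cat > cache/ds_z3_config.json << 'EOF'
{
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "zero_allow_untested_optimizer": true,
  "fp16": {
    "enabled": "auto"
  },
  "bf16": {
    "enabled": "auto"
  },
  "zero_optimization": {
    "stage": 3,
    "overlap_comm": true,
    "contiguous_gradients": true,
    "stage3_gather_16bit_weights_on_model_save": true
  }
}
EOF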