From cb63b32986c43f97994211ec34dc5928fc3bb9d7 Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Sun, 26 May 2024 23:46:33 +0800 Subject: [PATCH] support SimPO #3900 --- README.md | 7 +- README_zh.md | 7 +- examples/README.md | 8 +- examples/README_zh.md | 8 +- examples/lora_single_gpu/llama3_lora_dpo.yaml | 2 +- examples/lora_single_gpu/llama3_lora_kto.yaml | 1 - .../lora_single_gpu/llama3_lora_orpo.yaml | 38 ----- src/llamafactory/extras/constants.py | 1 - src/llamafactory/hparams/finetuning_args.py | 40 +++--- src/llamafactory/train/dpo/trainer.py | 128 +++++++++++++---- src/llamafactory/train/dpo/workflow.py | 11 +- src/llamafactory/train/kto/trainer.py | 4 +- src/llamafactory/train/orpo/__init__.py | 4 - src/llamafactory/train/orpo/trainer.py | 133 ------------------ src/llamafactory/train/orpo/workflow.py | 69 --------- src/llamafactory/train/tuner.py | 4 +- src/llamafactory/train/utils.py | 4 +- src/llamafactory/webui/components/train.py | 2 +- src/llamafactory/webui/runner.py | 13 +- 19 files changed, 145 insertions(+), 339 deletions(-) delete mode 100644 examples/lora_single_gpu/llama3_lora_orpo.yaml delete mode 100644 src/llamafactory/train/orpo/__init__.py delete mode 100644 src/llamafactory/train/orpo/trainer.py delete mode 100644 src/llamafactory/train/orpo/workflow.py diff --git a/README.md b/README.md index fcc96882..c6b72443 100644 --- a/README.md +++ b/README.md @@ -69,14 +69,16 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ ## Changelog +[24/05/26] We supported **[SimPO](https://arxiv.org/abs/2405.14734)** algorithm for preference learning. See [examples](examples/README.md) for usage. + [24/05/20] We supported fine-tuning the **PaliGemma** series models. Note that the PaliGemma models are pre-trained models, you need to fine-tune them with `gemma` template for chat completion. [24/05/18] We supported **[KTO](https://arxiv.org/abs/2402.01306)** algorithm for preference learning. 
See [examples](examples/README.md) for usage. -[24/05/14] We supported training and inference on the Ascend NPU devices. Check [installation](#installation) section for details. -
<details><summary>Full Changelog</summary> +[24/05/14] We supported training and inference on the Ascend NPU devices. Check [installation](#installation) section for details. + [24/04/26] We supported fine-tuning the **LLaVA-1.5** multimodal LLMs. See [examples](examples/README.md) for usage. [24/04/22] We provided a **[Colab notebook](https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing)** for fine-tuning the Llama-3 model on a free T4 GPU. Two Llama-3-derived models fine-tuned using LLaMA Factory are available at Hugging Face, check [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) and [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese) for details. @@ -193,6 +195,7 @@ You also can add a custom chat template to [template.py](src/llamafactory/data/t | DPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | KTO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | ORPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| SimPO Training | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | ## Provided Datasets diff --git a/README_zh.md b/README_zh.md index 2e0b4f34..5669e1ae 100644 --- a/README_zh.md +++ b/README_zh.md @@ -69,14 +69,16 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd ## 更新日志 +[24/05/26] 我们支持了 **[SimPO](https://arxiv.org/abs/2405.14734)** 偏好对齐算法。详细用法请参照 [examples](examples/README_zh.md)。 + [24/05/20] 我们支持了 **PaliGemma** 系列模型的微调。注意 PaliGemma 是预训练模型,你需要使用 `gemma` 模板进行微调使其获得对话能力。 [24/05/18] 我们支持了 **[KTO](https://arxiv.org/abs/2402.01306)** 偏好对齐算法。详细用法请参照 [examples](examples/README_zh.md)。 -[24/05/14] 我们支持了昇腾 NPU 设备的训练和推理。详情请查阅[安装](#安装-llama-factory)部分。 -
<details><summary>展开日志</summary> +[24/05/14] 我们支持了昇腾 NPU 设备的训练和推理。详情请查阅[安装](#安装-llama-factory)部分。 + [24/04/26] 我们支持了多模态模型 **LLaVA-1.5** 的微调。详细用法请参照 [examples](examples/README_zh.md)。 [24/04/22] 我们提供了在免费 T4 GPU 上微调 Llama-3 模型的 **[Colab 笔记本](https://colab.research.google.com/drive/1d5KQtbemerlSDSxZIfAaWXhKr30QypiK?usp=sharing)**。Hugging Face 社区公开了两个利用 LLaMA Factory 微调的 Llama-3 模型,详情请见 [Llama3-8B-Chinese-Chat](https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat) 和 [Llama3-Chinese](https://huggingface.co/zhichen/Llama3-Chinese)。 @@ -193,6 +195,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd | DPO 训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | KTO 训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | | ORPO 训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | +| SimPO 训练 | :white_check_mark: | :white_check_mark: | :white_check_mark: | :white_check_mark: | ## 数据集 diff --git a/examples/README.md b/examples/README.md index 94066b5d..9c6d5fb0 100644 --- a/examples/README.md +++ b/examples/README.md @@ -47,7 +47,7 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_ppo.yaml ``` -#### DPO Training +#### DPO/ORPO/SimPO Training ```bash CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_dpo.yaml @@ -59,12 +59,6 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_kto.yaml ``` -#### ORPO Training - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_orpo.yaml -``` - #### Preprocess Dataset It is useful for large dataset, use `tokenized_path` in config to load the preprocessed dataset. 
diff --git a/examples/README_zh.md b/examples/README_zh.md index 77e9c416..0ff33398 100644 --- a/examples/README_zh.md +++ b/examples/README_zh.md @@ -47,7 +47,7 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_ppo.yaml ``` -#### DPO 训练 +#### DPO/ORPO/SimPO 训练 ```bash CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_dpo.yaml @@ -59,12 +59,6 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lo CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_kto.yaml ``` -#### ORPO 训练 - -```bash -CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/lora_single_gpu/llama3_lora_orpo.yaml -``` - #### 预处理数据集 对于大数据集有帮助,在配置中使用 `tokenized_path` 以加载预处理后的数据集。 diff --git a/examples/lora_single_gpu/llama3_lora_dpo.yaml b/examples/lora_single_gpu/llama3_lora_dpo.yaml index 36d64923..958be1b5 100644 --- a/examples/lora_single_gpu/llama3_lora_dpo.yaml +++ b/examples/lora_single_gpu/llama3_lora_dpo.yaml @@ -6,7 +6,7 @@ stage: dpo do_train: true finetuning_type: lora lora_target: q_proj,v_proj -dpo_ftx: 1.0 +pref_loss: sigmoid # [sigmoid (dpo), orpo, simpo] ### dataset dataset: dpo_en_demo diff --git a/examples/lora_single_gpu/llama3_lora_kto.yaml b/examples/lora_single_gpu/llama3_lora_kto.yaml index 285289f9..4405aaec 100644 --- a/examples/lora_single_gpu/llama3_lora_kto.yaml +++ b/examples/lora_single_gpu/llama3_lora_kto.yaml @@ -6,7 +6,6 @@ stage: kto do_train: true finetuning_type: lora lora_target: q_proj,v_proj -kto_ftx: 0.1 ### dataset dataset: kto_en_demo diff --git a/examples/lora_single_gpu/llama3_lora_orpo.yaml b/examples/lora_single_gpu/llama3_lora_orpo.yaml deleted file mode 100644 index 880ccb1c..00000000 --- a/examples/lora_single_gpu/llama3_lora_orpo.yaml +++ /dev/null @@ -1,38 +0,0 @@ -### model -model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct - -### method 
-stage: orpo -do_train: true -finetuning_type: lora -lora_target: q_proj,v_proj - -### dataset -dataset: dpo_en_demo -template: llama3 -cutoff_len: 1024 -max_samples: 1000 -overwrite_cache: true -preprocessing_num_workers: 16 - -### output -output_dir: saves/llama3-8b/lora/orpo -logging_steps: 10 -save_steps: 500 -plot_loss: true -overwrite_output_dir: true - -### train -per_device_train_batch_size: 1 -gradient_accumulation_steps: 8 -learning_rate: 0.000005 -num_train_epochs: 3.0 -lr_scheduler_type: cosine -warmup_steps: 0.1 -fp16: true - -### eval -val_size: 0.1 -per_device_eval_batch_size: 1 -evaluation_strategy: steps -eval_steps: 500 diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index ae088e66..09c54899 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -48,7 +48,6 @@ TRAINING_STAGES = { "PPO": "ppo", "DPO": "dpo", "KTO": "kto", - "ORPO": "orpo", "Pre-Training": "pt", } diff --git a/src/llamafactory/hparams/finetuning_args.py b/src/llamafactory/hparams/finetuning_args.py index b84e238a..05b246ae 100644 --- a/src/llamafactory/hparams/finetuning_args.py +++ b/src/llamafactory/hparams/finetuning_args.py @@ -114,14 +114,18 @@ class LoraArguments: @dataclass class RLHFArguments: r""" - Arguments pertaining to the PPO and DPO training. + Arguments pertaining to the PPO, DPO and KTO training. 
""" - dpo_beta: float = field( + pref_beta: float = field( default=0.1, - metadata={"help": "The beta parameter for the DPO loss."}, + metadata={"help": "The beta parameter in the preference loss."}, ) - dpo_loss: Literal["sigmoid", "hinge", "ipo", "kto_pair"] = field( + pref_ftx: float = field( + default=0.0, + metadata={"help": "The supervised fine-tuning loss coefficient in DPO training."}, + ) + pref_loss: Literal["sigmoid", "hinge", "ipo", "kto_pair", "orpo", "simpo"] = field( default="sigmoid", metadata={"help": "The type of DPO loss to use."}, ) @@ -129,14 +133,6 @@ class RLHFArguments: default=0.0, metadata={"help": "The robust DPO label smoothing parameter in cDPO that should be between 0 and 0.5."}, ) - dpo_ftx: float = field( - default=0.0, - metadata={"help": "The supervised fine-tuning loss coefficient in DPO training."}, - ) - kto_beta: float = field( - default=0.1, - metadata={"help": "The beta parameter for the KTO loss."}, - ) kto_chosen_weight: float = field( default=1.0, metadata={"help": "The weight factor of the desirable losses in KTO training."}, @@ -145,13 +141,9 @@ class RLHFArguments: default=1.0, metadata={"help": "The weight factor of the undesirable losses in KTO training."}, ) - kto_ftx: float = field( - default=0.0, - metadata={"help": "The supervised fine-tuning loss coefficient in KTO training."}, - ) - orpo_beta: float = field( - default=0.1, - metadata={"help": "The beta (lambda) parameter in the ORPO loss representing the weight of the SFT loss."}, + simpo_gamma: float = field( + default=0.5, + metadata={"help": "The target reward margin term in SimPO loss."}, ) ppo_buffer_size: int = field( default=1, @@ -307,7 +299,7 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA default=False, metadata={"help": "Whether or not to train model in purely bf16 precision (without AMP)."}, ) - stage: Literal["pt", "sft", "rm", "ppo", "dpo", "kto", "orpo"] = field( + stage: Literal["pt", "sft", "rm", "ppo", "dpo", 
"kto"] = field( default="sft", metadata={"help": "Which stage will be performed in training."}, ) @@ -341,20 +333,22 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA assert self.ref_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." assert self.reward_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." + self.use_ref_model = self.pref_loss not in ["orpo", "simpo"] + if self.stage == "ppo" and self.reward_model is None: raise ValueError("`reward_model` is necessary for PPO training.") if self.stage == "ppo" and self.reward_model_type == "lora" and self.finetuning_type != "lora": raise ValueError("`reward_model_type` cannot be lora for Freeze/Full PPO training.") - if self.stage == "dpo" and self.dpo_loss != "sigmoid" and self.dpo_label_smoothing > 1e-6: + if self.stage == "dpo" and self.pref_loss != "sigmoid" and self.dpo_label_smoothing > 1e-6: raise ValueError("`dpo_label_smoothing` is only valid for sigmoid loss function.") if self.use_llama_pro and self.finetuning_type == "full": raise ValueError("`use_llama_pro` is only valid for the Freeze or LoRA training.") - if self.use_galore and self.finetuning_type == "lora": - raise ValueError("Cannot use LoRA with GaLore together.") + if self.finetuning_type == "lora" and (self.use_galore or self.use_badam): + raise ValueError("Cannot use LoRA with GaLore or BAdam together.") if self.use_galore and self.use_badam: raise ValueError("Cannot use GaLore with BAdam together.") diff --git a/src/llamafactory/train/dpo/trainer.py b/src/llamafactory/train/dpo/trainer.py index 23aa2c8a..f3c2443c 100644 --- a/src/llamafactory/train/dpo/trainer.py +++ b/src/llamafactory/train/dpo/trainer.py @@ -4,6 +4,7 @@ from types import MethodType from typing import TYPE_CHECKING, Dict, Literal, Optional, Tuple, Union import torch +import torch.nn.functional as F from transformers import Trainer from trl import DPOTrainer from 
trl.trainer.utils import disable_dropout_in_model @@ -50,10 +51,11 @@ class CustomDPOTrainer(DPOTrainer): self._stored_metrics = defaultdict(lambda: defaultdict(list)) # dpo hyperparams - self.beta = finetuning_args.dpo_beta + self.beta = finetuning_args.pref_beta + self.loss_type = finetuning_args.pref_loss + self.ftx_gamma = finetuning_args.pref_ftx self.label_smoothing = finetuning_args.dpo_label_smoothing - self.loss_type = finetuning_args.dpo_loss - self.ftx_gamma = finetuning_args.dpo_ftx + self.simpo_gamma = finetuning_args.simpo_gamma Trainer.__init__(self, model=model, **kwargs) if not hasattr(self, "accelerator"): @@ -90,15 +92,66 @@ class CustomDPOTrainer(DPOTrainer): output_dir = output_dir if output_dir is not None else self.args.output_dir getattr(self.processor, "image_processor").save_pretrained(output_dir) - def sft_loss(self, chosen_logits: "torch.FloatTensor", chosen_labels: "torch.LongTensor") -> "torch.Tensor": + def sft_loss(self, batch: Dict[str, "torch.Tensor"], chosen_logits: "torch.FloatTensor") -> "torch.Tensor": r""" Computes supervised cross-entropy loss of given labels under the given logits. Returns: A tensor of shape (batch_size,) containing the cross-entropy loss of each samples. """ - all_logps = self.get_batch_logps(chosen_logits, chosen_labels, average_log_prob=True) - return -all_logps + batch_size = batch["input_ids"].size(0) // 2 + chosen_labels, _ = batch["labels"].split(batch_size, dim=0) + chosen_logps = self.get_batch_logps(chosen_logits, chosen_labels, average_log_prob=True) + return -chosen_logps + + def odds_ratio_loss(self, chosen_logps: "torch.Tensor", rejected_logps: "torch.Tensor") -> "torch.Tensor": + r""" + Computes ORPO's odds ratio (OR) loss for batched log probabilities of the policy model. 
+ """ + log_odds = (chosen_logps - rejected_logps) - ( + torch.log1p(-torch.exp(chosen_logps)) - torch.log1p(-torch.exp(rejected_logps)) + ) + sft_loss = -chosen_logps + odds_ratio_loss = -F.logsigmoid(log_odds) + orpo_loss = sft_loss + self.beta * odds_ratio_loss + return orpo_loss + + def simpo_loss(self, chosen_logps: "torch.Tensor", rejected_logps: "torch.Tensor") -> "torch.Tensor": + r""" + Computes SimPO loss for batched log probabilities of the policy model. + """ + pi_logratios = chosen_logps - rejected_logps + gamma_logratios = self.simpo_gamma / self.beta + logits = pi_logratios - gamma_logratios + simpo_loss = -F.logsigmoid(self.beta * logits) + return simpo_loss + + def compute_preference_loss( + self, + policy_chosen_logps: "torch.Tensor", + policy_rejected_logps: "torch.Tensor", + reference_chosen_logps: Optional["torch.Tensor"], + reference_rejected_logps: Optional["torch.Tensor"], + ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor"]: + r""" + Computes loss for preference learning. + """ + if not self.finetuning_args.use_ref_model: + if self.loss_type == "orpo": + losses = self.odds_ratio_loss(policy_chosen_logps, policy_rejected_logps) + elif self.loss_type == "simpo": + losses = self.simpo_loss(policy_chosen_logps, policy_rejected_logps) + else: + raise NotImplementedError("Unknown loss type: {}.".format(self.loss_type)) + + chosen_rewards = self.beta * policy_chosen_logps.to(self.accelerator.device).detach() + rejected_rewards = self.beta * policy_rejected_logps.to(self.accelerator.device).detach() + else: + losses, chosen_rewards, rejected_rewards = self.dpo_loss( + policy_chosen_logps, policy_rejected_logps, reference_chosen_logps, reference_rejected_logps + ) + + return losses, chosen_rewards, rejected_rewards def concatenated_forward( self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"] @@ -108,13 +161,15 @@ class CustomDPOTrainer(DPOTrainer): Otherwise the average log probabilities. 
""" - batch_copied = {k: v.detach().clone() for k, v in batch.items()} # avoid error - all_logits: "torch.Tensor" = model(**batch_copied, return_dict=True, use_cache=False).logits.to(torch.float32) + if self.finetuning_args.use_ref_model: + batch = {k: v.detach().clone() for k, v in batch.items()} # avoid error + + all_logits: "torch.Tensor" = model(**batch, return_dict=True, use_cache=False).logits.to(torch.float32) all_logps = self.get_batch_logps( logits=all_logits, - labels=batch_copied["labels"], - average_log_prob=(self.loss_type == "ipo"), + labels=batch["labels"], + average_log_prob=(self.loss_type in ["ipo", "orpo", "simpo"]), is_encoder_decoder=self.is_encoder_decoder, label_pad_token_id=self.label_pad_token_id, ) @@ -123,6 +178,32 @@ class CustomDPOTrainer(DPOTrainer): chosen_logits, rejected_logits = all_logits.split(batch_size, dim=0) return chosen_logps, rejected_logps, chosen_logits, rejected_logits + def compute_reference_log_probs( + self, batch: Dict[str, "torch.Tensor"] + ) -> Tuple[Optional["torch.Tensor"], Optional["torch.Tensor"]]: + r""" + Computes log probabilities of the reference model. 
+ """ + if not self.finetuning_args.use_ref_model: + return None, None + + if self.ref_model is None: + ref_model = self.model + ref_context = self.accelerator.unwrap_model(self.model).disable_adapter() + else: + ref_model = self.ref_model + ref_context = nullcontext() + + with torch.no_grad(), ref_context: + ( + reference_chosen_logps, + reference_rejected_logps, + _, + _, + ) = self.concatenated_forward(ref_model, batch) + + return reference_chosen_logps, reference_rejected_logps + def get_batch_loss_metrics( self, model: "PreTrainedModel", @@ -140,32 +221,16 @@ class CustomDPOTrainer(DPOTrainer): policy_rejected_logits, ) = self.concatenated_forward(model, batch) - with torch.no_grad(): - if self.ref_model is None: - ref_model = self.model - ref_context = self.accelerator.unwrap_model(self.model).disable_adapter() - else: - ref_model = self.ref_model - ref_context = nullcontext() - - with ref_context: - ( - reference_chosen_logps, - reference_rejected_logps, - _, - _, - ) = self.concatenated_forward(ref_model, batch) - - losses, chosen_rewards, rejected_rewards = self.dpo_loss( + reference_chosen_logps, reference_rejected_logps = self.compute_reference_log_probs(batch) + losses, chosen_rewards, rejected_rewards = self.compute_preference_loss( policy_chosen_logps, policy_rejected_logps, reference_chosen_logps, reference_rejected_logps, ) + sft_loss = self.sft_loss(batch, policy_chosen_logits) # compute chosen_logps with masks if self.ftx_gamma > 1e-6: - batch_size = batch["input_ids"].size(0) // 2 - chosen_labels, _ = batch["labels"].split(batch_size, dim=0) - losses += self.ftx_gamma * self.sft_loss(policy_chosen_logits, chosen_labels) + losses += self.ftx_gamma * sft_loss reward_accuracies = (chosen_rewards > rejected_rewards).float() @@ -178,5 +243,8 @@ class CustomDPOTrainer(DPOTrainer): metrics["{}logps/chosen".format(prefix)] = policy_chosen_logps.detach().mean().cpu() metrics["{}logits/rejected".format(prefix)] = 
policy_rejected_logits.detach().mean().cpu() metrics["{}logits/chosen".format(prefix)] = policy_chosen_logits.detach().mean().cpu() + if self.loss_type == "orpo": + metrics["{}sft_loss".format(prefix)] = sft_loss.detach().mean().cpu() + metrics["{}odds_ratio_loss".format(prefix)] = ((losses - sft_loss) / self.beta).detach().mean().cpu() return losses.mean(), metrics diff --git a/src/llamafactory/train/dpo/workflow.py b/src/llamafactory/train/dpo/workflow.py index 8ac4952a..61a3e2f0 100644 --- a/src/llamafactory/train/dpo/workflow.py +++ b/src/llamafactory/train/dpo/workflow.py @@ -36,10 +36,13 @@ def run_dpo( ) # Create reference model - if finetuning_args.ref_model is None and (not training_args.do_train): # use the model itself - ref_model = model + if finetuning_args.use_ref_model: + if finetuning_args.ref_model is None and (not training_args.do_train): # use the model itself + ref_model = model + else: + ref_model = create_ref_model(model_args, finetuning_args) else: - ref_model = create_ref_model(model_args, finetuning_args) + ref_model = None # Update arguments training_args.remove_unused_columns = False # important for pairwise dataset @@ -69,7 +72,7 @@ def run_dpo( # Evaluation if training_args.do_eval: metrics = trainer.evaluate(metric_key_prefix="eval") - if id(model) == id(ref_model): # unable to compute rewards without a reference model + if id(model) == id(ref_model): # unable to compute rewards if reference model is the model itself remove_keys = [key for key in metrics.keys() if "rewards" in key] for key in remove_keys: metrics.pop(key) diff --git a/src/llamafactory/train/kto/trainer.py b/src/llamafactory/train/kto/trainer.py index b0e42406..096fd935 100644 --- a/src/llamafactory/train/kto/trainer.py +++ b/src/llamafactory/train/kto/trainer.py @@ -50,10 +50,10 @@ class CustomKTOTrainer(KTOTrainer): self._stored_metrics = defaultdict(lambda: defaultdict(list)) # kto hyperparams - self.beta = finetuning_args.kto_beta + self.beta = 
finetuning_args.pref_beta self.desirable_weight = finetuning_args.kto_chosen_weight self.undesirable_weight = finetuning_args.kto_rejected_weight - self.ftx_gamma = finetuning_args.kto_ftx + self.ftx_gamma = finetuning_args.pref_ftx Trainer.__init__(self, model=model, **kwargs) if not hasattr(self, "accelerator"): diff --git a/src/llamafactory/train/orpo/__init__.py b/src/llamafactory/train/orpo/__init__.py deleted file mode 100644 index e79d5ea3..00000000 --- a/src/llamafactory/train/orpo/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .workflow import run_orpo - - -__all__ = ["run_orpo"] diff --git a/src/llamafactory/train/orpo/trainer.py b/src/llamafactory/train/orpo/trainer.py deleted file mode 100644 index 7cfdb429..00000000 --- a/src/llamafactory/train/orpo/trainer.py +++ /dev/null @@ -1,133 +0,0 @@ -from collections import defaultdict -from types import MethodType -from typing import TYPE_CHECKING, Dict, Literal, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -from transformers import Trainer -from trl import DPOTrainer -from trl.trainer.utils import disable_dropout_in_model - -from ...extras.constants import IGNORE_INDEX -from ..utils import create_custom_optimzer, create_custom_scheduler - - -if TYPE_CHECKING: - from transformers import PreTrainedModel, ProcessorMixin - - from ...hparams import FinetuningArguments - - -class CustomORPOTrainer(DPOTrainer): - def __init__( - self, - model: Union["PreTrainedModel", "torch.nn.Module"], - finetuning_args: "FinetuningArguments", - processor: Optional["ProcessorMixin"], - disable_dropout: bool = True, - **kwargs, - ): - if disable_dropout: - disable_dropout_in_model(model) - - self.finetuning_args = finetuning_args - self.processor = processor - self.reference_free = False - self.use_dpo_data_collator = True # hack to avoid warning - self.generate_during_eval = False # disable at evaluation - self.label_pad_token_id = IGNORE_INDEX - self.padding_value = 0 - self.is_encoder_decoder = 
model.config.is_encoder_decoder - self.precompute_ref_log_probs = False - self._precomputed_train_ref_log_probs = False - self._precomputed_eval_ref_log_probs = False - self._peft_has_been_casted_to_bf16 = False - - self.beta = finetuning_args.orpo_beta - self._stored_metrics = defaultdict(lambda: defaultdict(list)) - - Trainer.__init__(self, model=model, **kwargs) - if finetuning_args.use_badam: - from badam import clip_grad_norm_for_sparse_tensor - - self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) - - def create_optimizer(self) -> "torch.optim.Optimizer": - if self.optimizer is None: - self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) - return super().create_optimizer() - - def create_scheduler( - self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None - ) -> "torch.optim.lr_scheduler.LRScheduler": - create_custom_scheduler(self.args, num_training_steps, optimizer) - return super().create_scheduler(num_training_steps, optimizer) - - def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: - super()._save(output_dir, state_dict) - if self.processor is not None: - output_dir = output_dir if output_dir is not None else self.args.output_dir - getattr(self.processor, "image_processor").save_pretrained(output_dir) - - def odds_ratio_loss(self, chosen_logps: "torch.Tensor", rejected_logps: "torch.Tensor") -> "torch.Tensor": - r""" - Computes ORPO's odds ratio (OR) loss. 
- """ - log_odds = (chosen_logps - rejected_logps) - ( - torch.log1p(-torch.exp(chosen_logps)) - torch.log1p(-torch.exp(rejected_logps)) - ) - odds_ratio_loss = -F.logsigmoid(log_odds) - return odds_ratio_loss - - def concatenated_forward( - self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"] - ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]: - r""" - Computes the average log probabilities of the labels under the given logits. - """ - all_logits: "torch.Tensor" = model(**batch, return_dict=True, use_cache=False).logits.to(torch.float32) - - all_logps = self.get_batch_logps( - logits=all_logits, - labels=batch["labels"], - average_log_prob=True, - is_encoder_decoder=self.is_encoder_decoder, - label_pad_token_id=self.label_pad_token_id, - ) - batch_size = batch["input_ids"].size(0) // 2 - chosen_logps, rejected_logps = all_logps.split(batch_size, dim=0) - chosen_logits, rejected_logits = all_logits.split(batch_size, dim=0) - return chosen_logps, rejected_logps, chosen_logits, rejected_logits - - def get_batch_loss_metrics( - self, - model: "PreTrainedModel", - batch: Dict[str, "torch.Tensor"], - train_eval: Literal["train", "eval"] = "train", - ) -> Tuple["torch.Tensor", Dict[str, "torch.Tensor"]]: - r""" - Computes the ORPO loss and other metrics for the given batch of inputs for train or test. 
- """ - metrics = {} - chosen_logps, rejected_logps, chosen_logits, rejected_logits = self.concatenated_forward(model, batch) - sft_loss = -chosen_logps - odds_ratio_loss = self.odds_ratio_loss(chosen_logps, rejected_logps) - batch_loss = (sft_loss + self.beta * odds_ratio_loss).mean() - - chosen_rewards = self.beta * chosen_logps.detach() - rejected_rewards = self.beta * rejected_logps.detach() - reward_accuracies = (chosen_rewards > rejected_rewards).float() - - prefix = "eval_" if train_eval == "eval" else "" - metrics["{}rewards/chosen".format(prefix)] = chosen_rewards.mean().cpu() - metrics["{}rewards/rejected".format(prefix)] = rejected_rewards.mean().cpu() - metrics["{}rewards/accuracies".format(prefix)] = reward_accuracies.mean().cpu() - metrics["{}rewards/margins".format(prefix)] = (chosen_rewards - rejected_rewards).mean().cpu() - metrics["{}logps/rejected".format(prefix)] = rejected_logps.detach().mean().cpu() - metrics["{}logps/chosen".format(prefix)] = chosen_logps.detach().mean().cpu() - metrics["{}logits/rejected".format(prefix)] = rejected_logits.detach().mean().cpu() - metrics["{}logits/chosen".format(prefix)] = chosen_logits.detach().mean().cpu() - metrics["{}sft_loss".format(prefix)] = sft_loss.detach().mean().cpu() - metrics["{}odds_ratio_loss".format(prefix)] = odds_ratio_loss.detach().mean().cpu() - - return batch_loss, metrics diff --git a/src/llamafactory/train/orpo/workflow.py b/src/llamafactory/train/orpo/workflow.py deleted file mode 100644 index 6ea18dae..00000000 --- a/src/llamafactory/train/orpo/workflow.py +++ /dev/null @@ -1,69 +0,0 @@ -# Inspired by: https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama_2/scripts/dpo_llama2.py - -from typing import TYPE_CHECKING, List, Optional - -from ...data import PairwiseDataCollatorWithPadding, get_dataset, split_dataset -from ...extras.constants import IGNORE_INDEX -from ...extras.ploting import plot_loss -from ...hparams import ModelArguments -from ...model 
import load_model, load_tokenizer -from ..utils import create_modelcard_and_push -from .trainer import CustomORPOTrainer - - -if TYPE_CHECKING: - from transformers import Seq2SeqTrainingArguments, TrainerCallback - - from ...hparams import DataArguments, FinetuningArguments - - -def run_orpo( - model_args: "ModelArguments", - data_args: "DataArguments", - training_args: "Seq2SeqTrainingArguments", - finetuning_args: "FinetuningArguments", - callbacks: Optional[List["TrainerCallback"]] = None, -): - tokenizer_module = load_tokenizer(model_args) - tokenizer = tokenizer_module["tokenizer"] - dataset = get_dataset(model_args, data_args, training_args, stage="rm", **tokenizer_module) - model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train) - - data_collator = PairwiseDataCollatorWithPadding( - tokenizer=tokenizer, - pad_to_multiple_of=8, - label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id, - ) - - # Update arguments - training_args.remove_unused_columns = False # important for pairwise dataset - - # Initialize our Trainer - trainer = CustomORPOTrainer( - model=model, - args=training_args, - finetuning_args=finetuning_args, - data_collator=data_collator, - callbacks=callbacks, - **tokenizer_module, - **split_dataset(dataset, data_args, training_args), - ) - - # Training - if training_args.do_train: - train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) - trainer.save_model() - trainer.log_metrics("train", train_result.metrics) - trainer.save_metrics("train", train_result.metrics) - trainer.save_state() - if trainer.is_world_process_zero() and finetuning_args.plot_loss: - plot_loss(training_args.output_dir, keys=["loss", "eval_loss", "rewards/accuracies", "sft_loss"]) - - # Evaluation - if training_args.do_eval: - metrics = trainer.evaluate(metric_key_prefix="eval") - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - - # Create 
model card - create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) diff --git a/src/llamafactory/train/tuner.py b/src/llamafactory/train/tuner.py index fadbb14a..eed875e9 100644 --- a/src/llamafactory/train/tuner.py +++ b/src/llamafactory/train/tuner.py @@ -10,7 +10,6 @@ from ..hparams import get_infer_args, get_train_args from ..model import load_model, load_tokenizer from .dpo import run_dpo from .kto import run_kto -from .orpo import run_orpo from .ppo import run_ppo from .pt import run_pt from .rm import run_rm @@ -40,8 +39,6 @@ def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: List["TrainerCallb run_dpo(model_args, data_args, training_args, finetuning_args, callbacks) elif finetuning_args.stage == "kto": run_kto(model_args, data_args, training_args, finetuning_args, callbacks) - elif finetuning_args.stage == "orpo": - run_orpo(model_args, data_args, training_args, finetuning_args, callbacks) else: raise ValueError("Unknown task.") @@ -100,5 +97,6 @@ def export_model(args: Optional[Dict[str, Any]] = None) -> None: getattr(processor, "image_processor").push_to_hub( model_args.export_hub_model_id, token=model_args.hf_hub_token ) + except Exception: logger.warning("Cannot save tokenizer, please copy the files manually.") diff --git a/src/llamafactory/train/utils.py b/src/llamafactory/train/utils.py index 21dac461..23834f2d 100644 --- a/src/llamafactory/train/utils.py +++ b/src/llamafactory/train/utils.py @@ -90,7 +90,7 @@ def create_ref_model( ) ) ref_model_args = ModelArguments(**ref_model_args_dict) - ref_finetuning_args = FinetuningArguments(finetuning_type="lora") + ref_finetuning_args = FinetuningArguments() tokenizer = load_tokenizer(ref_model_args)["tokenizer"] ref_model = load_model( tokenizer, ref_model_args, ref_finetuning_args, is_trainable=False, add_valuehead=add_valuehead @@ -146,7 +146,7 @@ def create_reward_model( ) ) reward_model_args = ModelArguments(**reward_model_args_dict) - 
reward_finetuning_args = FinetuningArguments(finetuning_type="lora") + reward_finetuning_args = FinetuningArguments() tokenizer = load_tokenizer(reward_model_args)["tokenizer"] reward_model = load_model( tokenizer, reward_model_args, reward_finetuning_args, is_trainable=False, add_valuehead=True diff --git a/src/llamafactory/webui/components/train.py b/src/llamafactory/webui/components/train.py index 9b48c89a..d399106f 100644 --- a/src/llamafactory/webui/components/train.py +++ b/src/llamafactory/webui/components/train.py @@ -186,7 +186,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]: with gr.Row(): pref_beta = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.01) pref_ftx = gr.Slider(minimum=0, maximum=10, value=0, step=0.01) - pref_loss = gr.Dropdown(choices=["sigmoid", "hinge", "ipo", "kto_pair"], value="sigmoid") + pref_loss = gr.Dropdown(choices=["sigmoid", "hinge", "ipo", "kto_pair", "orpo", "simpo"], value="sigmoid") reward_model = gr.Dropdown(multiselect=True, allow_custom_value=True) with gr.Column(): ppo_score_norm = gr.Checkbox() diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py index 24046e62..57595a08 100644 --- a/src/llamafactory/webui/runner.py +++ b/src/llamafactory/webui/runner.py @@ -179,15 +179,10 @@ class Runner: args["ppo_whiten_rewards"] = get("train.ppo_whiten_rewards") args["top_k"] = 0 args["top_p"] = 0.9 - elif args["stage"] == "dpo": - args["dpo_beta"] = get("train.pref_beta") - args["dpo_ftx"] = get("train.pref_ftx") - args["dpo_loss"] = get("train.pref_loss") - elif args["stage"] == "kto": - args["kto_beta"] = get("train.pref_beta") - args["kto_ftx"] = get("train.pref_ftx") - elif args["stage"] == "orpo": - args["orpo_beta"] = get("train.pref_beta") + elif args["stage"] in ["dpo", "kto"]: + args["pref_beta"] = get("train.pref_beta") + args["pref_ftx"] = get("train.pref_ftx") + args["pref_loss"] = get("train.pref_loss") # galore config if args["use_galore"]: