diff --git a/src/llmtuner/model/__init__.py b/src/llmtuner/model/__init__.py
index f6be60d8..db81e1dc 100644
--- a/src/llmtuner/model/__init__.py
+++ b/src/llmtuner/model/__init__.py
@@ -1,11 +1,10 @@
-from .loader import load_config, load_model, load_tokenizer, load_processor
+from .loader import load_config, load_model, load_tokenizer
 from .utils.misc import find_all_linear_modules, load_valuehead_params
 
 __all__ = [
     "load_config",
     "load_model",
     "load_tokenizer",
-    "load_processor",
     "load_valuehead_params",
     "find_all_linear_modules",
 ]
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index 18b0cf79..99ad9adc 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -40,7 +40,9 @@ def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]:
     }
 
 
-def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer":
+def load_tokenizer(
+    model_args: "ModelArguments",
+) -> Dict[str, Union["PreTrainedTokenizer", "AutoProcessor"]]:
     r"""
     Loads pretrained tokenizer.
 
@@ -78,33 +80,25 @@ def load_tokenizer(model_args: "ModelArguments") -> "PreTrainedTokenizer":
         )
 
     patch_tokenizer(tokenizer)
-    return tokenizer
-
-
-def load_processor(model_args: "ModelArguments") -> "AutoProcessor":
-    r"""
-    Loads processor. Must before load_model.
-
-    Note: including inplace operation of model_args.
-    """
-    init_kwargs = _get_init_kwargs(model_args)
-    try:
-        processor = AutoProcessor.from_pretrained(
-            model_args.model_name_or_path,
-            use_fast=model_args.use_fast_tokenizer,
-            split_special_tokens=model_args.split_special_tokens,
-            padding_side="right",
-            **init_kwargs,
-        )
-    except Exception:  # try the fast one
-        processor = AutoProcessor.from_pretrained(
-            model_args.model_name_or_path,
-            use_fast=True,
-            padding_side="right",
-            **init_kwargs,
-        )
-
-    return processor
+    tokenizer_modules = {"tokenizer": tokenizer, "processor": None}
+    if model_args.use_mllm:
+        try:
+            processor = AutoProcessor.from_pretrained(
+                model_args.model_name_or_path,
+                use_fast=model_args.use_fast_tokenizer,
+                split_special_tokens=model_args.split_special_tokens,
+                padding_side="right",
+                **init_kwargs,
+            )
+        except Exception:  # try the fast one
+            processor = AutoProcessor.from_pretrained(
+                model_args.model_name_or_path,
+                use_fast=True,
+                padding_side="right",
+                **init_kwargs,
+            )
+        tokenizer_modules["processor"] = processor
+    return tokenizer_modules
 
 
 def load_config(model_args: "ModelArguments") -> "PretrainedConfig":
diff --git a/src/llmtuner/train/sft/workflow.py b/src/llmtuner/train/sft/workflow.py
index 9ab78850..6f887810 100644
--- a/src/llmtuner/train/sft/workflow.py
+++ b/src/llmtuner/train/sft/workflow.py
@@ -17,7 +17,12 @@ from .trainer import CustomSeq2SeqTrainer
 if TYPE_CHECKING:
     from transformers import Seq2SeqTrainingArguments, TrainerCallback
 
-    from ...hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
+    from ...hparams import (
+        DataArguments,
+        FinetuningArguments,
+        GeneratingArguments,
+        ModelArguments,
+    )
 
 
 def run_sft(
@@ -28,25 +33,48 @@
     generating_args: "GeneratingArguments",
     callbacks: Optional[List["TrainerCallback"]] = None,
 ):
-    tokenizer = load_tokenizer(model_args)
-    dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="sft")
+    tokenizer_modules = load_tokenizer(model_args)
+    tokenizer = tokenizer_modules["tokenizer"]
+    processor = tokenizer_modules["processor"]
+    dataset = get_dataset(
+        tokenizer,
+        model_args,
+        data_args,
+        training_args,
+        stage="sft",
+        processor=processor,
+    )
     model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train)
 
     if training_args.predict_with_generate:
         tokenizer.padding_side = "left"  # use left-padding in generation
 
     if getattr(model, "is_quantized", False) and not training_args.do_train:
-        setattr(model, "_hf_peft_config_loaded", True)  # hack here: make model compatible with prediction
+        setattr(
+            model, "_hf_peft_config_loaded", True
+        )  # hack here: make model compatible with prediction
 
     data_collator = DataCollatorForSeq2Seq(
         tokenizer=tokenizer,
-        pad_to_multiple_of=8 if tokenizer.padding_side == "right" else None,  # for shift short attention
-        label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id,
+        pad_to_multiple_of=(
+            8 if tokenizer.padding_side == "right" else None
+        ),  # for shift short attention
+        label_pad_token_id=(
+            IGNORE_INDEX
+            if data_args.ignore_pad_token_for_loss
+            else tokenizer.pad_token_id
+        ),
     )
 
     # Override the decoding parameters of Seq2SeqTrainer
-    training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len
-    training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams
+    training_args.generation_max_length = (
+        training_args.generation_max_length or data_args.cutoff_len
+    )
+    training_args.generation_num_beams = (
+        data_args.eval_num_beams or training_args.generation_num_beams
+    )
+    if model_args.use_mllm:
+        training_args.remove_unused_columns = False
 
     # Initialize our Trainer
     trainer = CustomSeq2SeqTrainer(
@@ -56,19 +84,25 @@
         tokenizer=tokenizer,
         data_collator=data_collator,
         callbacks=callbacks,
-        compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None,
+        compute_metrics=(
+            ComputeMetrics(tokenizer) if training_args.predict_with_generate else None
+        ),
         **split_dataset(dataset, data_args, training_args),
     )
 
     # Keyword arguments for `model.generate`
     gen_kwargs = generating_args.to_dict()
-    gen_kwargs["eos_token_id"] = [tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids
+    gen_kwargs["eos_token_id"] = [
+        tokenizer.eos_token_id
+    ] + tokenizer.additional_special_tokens_ids
     gen_kwargs["pad_token_id"] = tokenizer.pad_token_id
     gen_kwargs["logits_processor"] = get_logits_processor()
 
     # Training
     if training_args.do_train:
-        train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
+        train_result = trainer.train(
+            resume_from_checkpoint=training_args.resume_from_checkpoint
+        )
         trainer.save_model()
         trainer.log_metrics("train", train_result.metrics)
         trainer.save_metrics("train", train_result.metrics)
@@ -79,19 +113,27 @@
     # Evaluation
     if training_args.do_eval:
         metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs)
-        if training_args.predict_with_generate:  # eval_loss will be wrong if predict_with_generate is enabled
+        if (
+            training_args.predict_with_generate
+        ):  # eval_loss will be wrong if predict_with_generate is enabled
             metrics.pop("eval_loss", None)
         trainer.log_metrics("eval", metrics)
         trainer.save_metrics("eval", metrics)
 
     # Predict
     if training_args.do_predict:
-        predict_results = trainer.predict(dataset, metric_key_prefix="predict", **gen_kwargs)
-        if training_args.predict_with_generate:  # predict_loss will be wrong if predict_with_generate is enabled
+        predict_results = trainer.predict(
+            dataset, metric_key_prefix="predict", **gen_kwargs
+        )
+        if (
+            training_args.predict_with_generate
+        ):  # predict_loss will be wrong if predict_with_generate is enabled
             predict_results.metrics.pop("predict_loss", None)
         trainer.log_metrics("predict", predict_results.metrics)
         trainer.save_metrics("predict", predict_results.metrics)
         trainer.save_predictions(predict_results)
 
     # Create model card
-    create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args)
+    create_modelcard_and_push(
+        trainer, model_args, data_args, training_args, finetuning_args
+    )
diff --git a/src/llmtuner/train/sftmm/__init__.py b/src/llmtuner/train/sftmm/__init__.py
deleted file mode 100644
index 3eb8b2e2..00000000
--- a/src/llmtuner/train/sftmm/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .workflow import run_sft_mm
-
-__all__ = ["run_sft_mm"]
diff --git a/src/llmtuner/train/sftmm/metric.py b/src/llmtuner/train/sftmm/metric.py
deleted file mode 100644
index d1af4c17..00000000
--- a/src/llmtuner/train/sftmm/metric.py
+++ /dev/null
@@ -1,61 +0,0 @@
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Dict, Sequence, Tuple, Union
-
-import numpy as np
-
-from ...extras.constants import IGNORE_INDEX
-from ...extras.packages import is_jieba_available, is_nltk_available, is_rouge_available
-
-
-if TYPE_CHECKING:
-    from transformers.tokenization_utils import PreTrainedTokenizer
-
-if is_jieba_available():
-    import jieba  # type: ignore
-
-if is_nltk_available():
-    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
-
-if is_rouge_available():
-    from rouge_chinese import Rouge
-
-
-@dataclass
-class ComputeMetrics:
-    r"""
-    Wraps the tokenizer into metric functions, used in Seq2SeqPeftTrainer.
-    """
-
-    tokenizer: "PreTrainedTokenizer"
-
-    def __call__(self, eval_preds: Sequence[Union[np.ndarray, Tuple[np.ndarray]]]) -> Dict[str, float]:
-        r"""
-        Uses the model predictions to compute metrics.
- """ - preds, labels = eval_preds - score_dict = {"rouge-1": [], "rouge-2": [], "rouge-l": [], "bleu-4": []} - - preds = np.where(preds != IGNORE_INDEX, preds, self.tokenizer.pad_token_id) - labels = np.where(labels != IGNORE_INDEX, labels, self.tokenizer.pad_token_id) - - decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) - decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) - - for pred, label in zip(decoded_preds, decoded_labels): - hypothesis = list(jieba.cut(pred)) - reference = list(jieba.cut(label)) - - if len(" ".join(hypothesis).split()) == 0 or len(" ".join(reference).split()) == 0: - result = {"rouge-1": {"f": 0.0}, "rouge-2": {"f": 0.0}, "rouge-l": {"f": 0.0}} - else: - rouge = Rouge() - scores = rouge.get_scores(" ".join(hypothesis), " ".join(reference)) - result = scores[0] - - for k, v in result.items(): - score_dict[k].append(round(v["f"] * 100, 4)) - - bleu_score = sentence_bleu([list(label)], list(pred), smoothing_function=SmoothingFunction().method3) - score_dict["bleu-4"].append(round(bleu_score * 100, 4)) - - return {k: float(np.mean(v)) for k, v in score_dict.items()} diff --git a/src/llmtuner/train/sftmm/trainer.py b/src/llmtuner/train/sftmm/trainer.py deleted file mode 100644 index f094e609..00000000 --- a/src/llmtuner/train/sftmm/trainer.py +++ /dev/null @@ -1,44 +0,0 @@ -import json -import os -from types import MethodType -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union - -import numpy as np -import torch -from transformers import Seq2SeqTrainer, Trainer - -from ...extras.constants import IGNORE_INDEX -from ...extras.logging import get_logger -from ..utils import create_custom_optimzer, create_custom_scheduler - -if TYPE_CHECKING: - from transformers.trainer import PredictionOutput - from peft import PeftModelForCausalLM - from ...hparams import FinetuningArguments - -logger = get_logger(__name__) - - -class CustomSeq2SeqTrainer(Seq2SeqTrainer): - r""" - Inherits Seq2SeqTrainer to compute generative metrics such as BLEU and ROUGE. 
- """ - - def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None: - super().__init__(**kwargs) - self.finetuning_args = finetuning_args - if finetuning_args.use_badam: - from badam import clip_grad_norm_for_sparse_tensor - - self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) - - def create_optimizer(self) -> "torch.optim.Optimizer": - if self.optimizer is None: - self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) - return super().create_optimizer() - - def create_scheduler( - self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None - ) -> "torch.optim.lr_scheduler.LRScheduler": - create_custom_scheduler(self.args, num_training_steps, optimizer) - return super().create_scheduler(num_training_steps, optimizer) diff --git a/src/llmtuner/train/sftmm/workflow.py b/src/llmtuner/train/sftmm/workflow.py deleted file mode 100644 index 3849a563..00000000 --- a/src/llmtuner/train/sftmm/workflow.py +++ /dev/null @@ -1,127 +0,0 @@ -# Inspired by: https://github.com/huggingface/transformers/blob/v4.34.1/examples/pytorch/summarization/run_summarization.py -import os -from typing import TYPE_CHECKING, List, Optional -from ...data import get_dataset -from ...extras.misc import get_logits_processor -from ...extras.ploting import plot_loss -from ...model import load_processor, load_model -from ..utils import create_modelcard_and_push -from .metric import ComputeMetrics -from .trainer import CustomSeq2SeqTrainer -from transformers import DataCollatorForSeq2Seq -from ...extras.constants import IGNORE_INDEX - -if TYPE_CHECKING: - from transformers import Seq2SeqTrainingArguments, TrainerCallback - - from ...hparams import ( - DataArguments, - FinetuningArguments, - GeneratingArguments, - ModelArguments, - ) - - -def run_sft_mm( - model_args: "ModelArguments", - data_args: "DataArguments", - training_args: "Seq2SeqTrainingArguments", - finetuning_args: "FinetuningArguments", - generating_args: "GeneratingArguments", - callbacks: Optional[List["TrainerCallback"]] = None, -): - processor = load_processor(model_args) - tokenizer = processor.tokenizer - dataset = get_dataset( - tokenizer, model_args, data_args, training_args, "sft", processor - ) - model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train) - if getattr(model, "is_quantized", False) and not training_args.do_train: - setattr( - model, "_hf_peft_config_loaded", True - ) # hack here: make model compatible with prediction - train_dataset = dataset - eval_dataset = dataset - data_collator = DataCollatorForSeq2Seq( - tokenizer=tokenizer, - pad_to_multiple_of=( - 8 if tokenizer.padding_side == "right" else None - ), # for shift short attention - label_pad_token_id=( - IGNORE_INDEX - if data_args.ignore_pad_token_for_loss - else tokenizer.pad_token_id - ), - ) - - # Override the decoding parameters of Seq2SeqTrainer - training_args.generation_max_length = ( - training_args.generation_max_length or data_args.cutoff_len - ) - training_args.generation_num_beams = ( - data_args.eval_num_beams or training_args.generation_num_beams - ) - training_args.remove_unused_columns = False - - # Initialize our Trainer - trainer = CustomSeq2SeqTrainer( - model=model, - args=training_args, - finetuning_args=finetuning_args, - tokenizer=tokenizer, - data_collator=data_collator, - callbacks=callbacks, - compute_metrics=( - ComputeMetrics(tokenizer) if training_args.predict_with_generate else None - ), - 
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-    )
-
-    # Keyword arguments for `model.generate`
-    gen_kwargs = generating_args.to_dict()
-    gen_kwargs["eos_token_id"] = [
-        tokenizer.eos_token_id
-    ] + tokenizer.additional_special_tokens_ids
-    gen_kwargs["pad_token_id"] = tokenizer.pad_token_id
-    gen_kwargs["logits_processor"] = get_logits_processor()
-
-    # Training
-    if training_args.do_train:
-        train_result = trainer.train(
-            resume_from_checkpoint=training_args.resume_from_checkpoint
-        )
-        trainer.save_model()
-        trainer.log_metrics("train", train_result.metrics)
-        trainer.save_metrics("train", train_result.metrics)
-        trainer.save_state()
-        if trainer.is_world_process_zero() and finetuning_args.plot_loss:
-            plot_loss(training_args.output_dir, keys=["loss", "eval_loss"])
-
-    # Evaluation
-    if training_args.do_eval:
-        metrics = trainer.evaluate(metric_key_prefix="eval", **gen_kwargs)
-        if (
-            training_args.predict_with_generate
-        ):  # eval_loss will be wrong if predict_with_generate is enabled
-            metrics.pop("eval_loss", None)
-        trainer.log_metrics("eval", metrics)
-        trainer.save_metrics("eval", metrics)
-
-    # Predict
-    if training_args.do_predict:
-        predict_results = trainer.predict(
-            dataset, metric_key_prefix="predict", **gen_kwargs
-        )
-        if (
-            training_args.predict_with_generate
-        ):  # predict_loss will be wrong if predict_with_generate is enabled
-            predict_results.metrics.pop("predict_loss", None)
-        trainer.log_metrics("predict", predict_results.metrics)
-        trainer.save_metrics("predict", predict_results.metrics)
-        trainer.save_predictions(predict_results)
-
-    # Create model card
-    create_modelcard_and_push(
-        trainer, model_args, data_args, training_args, finetuning_args
-    )
diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index ac56289c..5f691225 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -14,7 +14,6 @@ from .ppo import run_ppo
 from .pt import run_pt
 from .rm import run_rm
 from .sft import run_sft
-from .sftmm import run_sft_mm
 
 if TYPE_CHECKING:
     from transformers import TrainerCallback
@@ -30,8 +29,6 @@ def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: Optional[List["Tra
         run_pt(model_args, data_args, training_args, finetuning_args, callbacks)
     elif finetuning_args.stage == "sft":
         run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
-    elif finetuning_args.stage == "sft_mm":
-        run_sft_mm(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
     elif finetuning_args.stage == "rm":
         run_rm(model_args, data_args, training_args, finetuning_args, callbacks)
     elif finetuning_args.stage == "ppo":
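
Note (not part of the patch): after this change, load_tokenizer() returns a dict instead of a bare tokenizer, and multimodal support is gated by the use_mllm flag on ModelArguments rather than a separate sft_mm stage. A minimal sketch of how a caller is expected to consume the new return value, assuming the use_mllm field exists on ModelArguments in this branch; the model name below is only an example.

# Sketch only: consuming the new load_tokenizer() contract.
from llmtuner.hparams import ModelArguments
from llmtuner.model import load_tokenizer

# Example/assumed values; use_mllm comes from this branch's ModelArguments.
model_args = ModelArguments(model_name_or_path="llava-hf/llava-1.5-7b-hf", use_mllm=True)
tokenizer_modules = load_tokenizer(model_args)
tokenizer = tokenizer_modules["tokenizer"]    # always a PreTrainedTokenizer
processor = tokenizer_modules["processor"]    # AutoProcessor when use_mllm=True, otherwise None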