From 148bda353f0b53af022c51da9a9e59a56f341510 Mon Sep 17 00:00:00 2001
From: hiyouga
Date: Wed, 3 Apr 2024 18:14:24 +0800
Subject: [PATCH] fix resize vocab at inference #3022

---
 scripts/cal_lr.py              |  8 ++++----
 scripts/length_cdf.py          |  6 +++---
 setup.py                       |  2 +-
 src/llmtuner/chat/hf_engine.py |  9 +++++----
 src/llmtuner/eval/evaluator.py |  5 +++--
 src/llmtuner/model/__init__.py |  3 +--
 src/llmtuner/model/loader.py   | 16 +---------------
 src/llmtuner/train/tuner.py    |  5 +++--
 src/llmtuner/train/utils.py    | 17 ++++++++++-------
 9 files changed, 31 insertions(+), 40 deletions(-)

diff --git a/scripts/cal_lr.py b/scripts/cal_lr.py
index 6decf0c2..ffe47f28 100644
--- a/scripts/cal_lr.py
+++ b/scripts/cal_lr.py
@@ -15,7 +15,7 @@ from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
 from llmtuner.data import get_dataset
 from llmtuner.extras.constants import IGNORE_INDEX
 from llmtuner.hparams import get_train_args
-from llmtuner.model import load_model_and_tokenizer
+from llmtuner.model import load_tokenizer
 
 
 BASE_LR = 3e-4  # 1.5e-4 for 30B-70B models
@@ -32,7 +32,7 @@ def calculate_lr(
     cutoff_len: Optional[int] = 1024,  # i.e. maximum input length during training
     is_mistral: Optional[bool] = False,  # mistral model uses a smaller learning rate,
 ):
-    model_args, data_args, training_args, finetuning_args, _ = get_train_args(
+    model_args, data_args, training_args, _, _ = get_train_args(
         dict(
             stage=stage,
             model_name_or_path=model_name_or_path,
@@ -44,8 +44,8 @@ def calculate_lr(
             overwrite_cache=True,
         )
     )
-    _, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, is_trainable=False, add_valuehead=False)
-    trainset = get_dataset(tokenizer, model_args, data_args, training_args, stage=stage)
+    tokenizer = load_tokenizer(model_args)
+    trainset = get_dataset(tokenizer, model_args, data_args, training_args, stage)
     if stage == "pt":
         data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
     elif stage == "sft":
diff --git a/scripts/length_cdf.py b/scripts/length_cdf.py
index d9cb06f5..cf0698de 100644
--- a/scripts/length_cdf.py
+++ b/scripts/length_cdf.py
@@ -10,7 +10,7 @@ from tqdm import tqdm
 
 from llmtuner.data import get_dataset
 from llmtuner.hparams import get_train_args
-from llmtuner.model import load_model_and_tokenizer
+from llmtuner.model import load_tokenizer
 
 
 def length_cdf(
@@ -20,7 +20,7 @@ def length_cdf(
     template: Optional[str] = "default",
     interval: Optional[int] = 1000,
 ):
-    model_args, data_args, training_args, finetuning_args, _ = get_train_args(
+    model_args, data_args, training_args, _, _ = get_train_args(
         dict(
             stage="sft",
             model_name_or_path=model_name_or_path,
@@ -32,7 +32,7 @@ def length_cdf(
             overwrite_cache=True,
         )
     )
-    _, tokenizer = load_model_and_tokenizer(model_args, finetuning_args, is_trainable=False, add_valuehead=False)
+    tokenizer = load_tokenizer(model_args)
     trainset = get_dataset(tokenizer, model_args, data_args, training_args, stage="sft")
     total_num = len(trainset)
     length_dict = defaultdict(int)
diff --git a/setup.py b/setup.py
index 8d6c2031..2caee7a8 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@ def get_requires():
 
 
 extra_require = {
-    "deepspeed": ["deepspeed"],
+    "deepspeed": ["deepspeed>=0.10.0"],
     "metrics": ["nltk", "jieba", "rouge-chinese"],
     "unsloth": ["torch==2.2.0", "unsloth[cu121-ampere-torch220]"],
     "vllm": ["vllm>=0.3.3"],
diff --git a/src/llmtuner/chat/hf_engine.py b/src/llmtuner/chat/hf_engine.py
index c634ba16..bcdbd15a 100644
--- a/src/llmtuner/chat/hf_engine.py
+++ b/src/llmtuner/chat/hf_engine.py
@@ -9,7 +9,7 @@ from transformers import GenerationConfig, TextIteratorStreamer
 
 from ..data import get_template_and_fix_tokenizer
 from ..extras.misc import get_logits_processor
-from ..model import load_model_and_tokenizer
+from ..model import load_model, load_tokenizer
 from .base_engine import BaseEngine, Response
 
 
@@ -30,11 +30,12 @@ class HuggingfaceEngine(BaseEngine):
         generating_args: "GeneratingArguments",
     ) -> None:
         self.can_generate = finetuning_args.stage == "sft"
-        self.model, self.tokenizer = load_model_and_tokenizer(
-            model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate)
-        )
+        self.tokenizer = load_tokenizer(model_args)
         self.tokenizer.padding_side = "left" if self.can_generate else "right"
         self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template)
+        self.model = load_model(
+            self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate)
+        )
         self.generating_args = generating_args.to_dict()
 
     @staticmethod
diff --git a/src/llmtuner/eval/evaluator.py b/src/llmtuner/eval/evaluator.py
index 4969561f..2c039928 100644
--- a/src/llmtuner/eval/evaluator.py
+++ b/src/llmtuner/eval/evaluator.py
@@ -14,16 +14,17 @@ from transformers.utils import cached_file
 from ..data import get_template_and_fix_tokenizer
 from ..extras.constants import CHOICES, SUBJECTS
 from ..hparams import get_eval_args
-from ..model import load_model_and_tokenizer
+from ..model import load_model, load_tokenizer
 from .template import get_eval_template
 
 
 class Evaluator:
     def __init__(self, args: Optional[Dict[str, Any]] = None) -> None:
         self.model_args, self.data_args, self.eval_args, finetuning_args = get_eval_args(args)
-        self.model, self.tokenizer = load_model_and_tokenizer(self.model_args, finetuning_args)
+        self.tokenizer = load_tokenizer(self.model_args)
         self.tokenizer.padding_side = "right"  # avoid overflow issue in batched inference for llama2
         self.template = get_template_and_fix_tokenizer(self.tokenizer, self.data_args.template)
+        self.model = load_model(self.tokenizer, self.model_args, finetuning_args)
         self.eval_template = get_eval_template(self.eval_args.lang)
         self.choice_inputs = [
             self.tokenizer.encode(self.eval_template.prefix + ch, add_special_tokens=False)[-1] for ch in CHOICES
diff --git a/src/llmtuner/model/__init__.py b/src/llmtuner/model/__init__.py
index 4b1b26fc..1eaf4271 100644
--- a/src/llmtuner/model/__init__.py
+++ b/src/llmtuner/model/__init__.py
@@ -1,10 +1,9 @@
-from .loader import load_model, load_model_and_tokenizer, load_tokenizer
+from .loader import load_model, load_tokenizer
 from .utils import find_all_linear_modules, load_valuehead_params
 
 
 __all__ = [
     "load_model",
-    "load_model_and_tokenizer",
     "load_tokenizer",
     "load_valuehead_params",
     "find_all_linear_modules",
diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py
index d05c0886..e91a7b68 100644
--- a/src/llmtuner/model/loader.py
+++ b/src/llmtuner/model/loader.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Any, Dict, Tuple
+from typing import TYPE_CHECKING, Any, Dict
 
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 from trl import AutoModelForCausalLMWithValueHead
@@ -133,17 +133,3 @@ def load_model(
     )
 
     return model
-
-
-def load_model_and_tokenizer(
-    model_args: "ModelArguments",
-    finetuning_args: "FinetuningArguments",
-    is_trainable: bool = False,
-    add_valuehead: bool = False,
-) -> Tuple["PreTrainedModel", "PreTrainedTokenizer"]:
-    r"""
-    Loads pretrained model and tokenizer.
-    """
-    tokenizer = load_tokenizer(model_args)
-    model = load_model(tokenizer, model_args, finetuning_args, is_trainable, add_valuehead)
-    return model, tokenizer
diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index 299e4f2a..f6c2e16b 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -7,7 +7,7 @@ from ..data import get_template_and_fix_tokenizer
 from ..extras.callbacks import LogCallback
 from ..extras.logging import get_logger
 from ..hparams import get_infer_args, get_train_args
-from ..model import load_model_and_tokenizer
+from ..model import load_model, load_tokenizer
 from .dpo import run_dpo
 from .orpo import run_orpo
 from .ppo import run_ppo
@@ -52,8 +52,9 @@ def export_model(args: Optional[Dict[str, Any]] = None):
     if model_args.adapter_name_or_path is not None and model_args.export_quantization_bit is not None:
         raise ValueError("Please merge adapters before quantizing the model.")
 
-    model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args)
+    tokenizer = load_tokenizer(model_args)
     get_template_and_fix_tokenizer(tokenizer, data_args.template)
+    model = load_model(tokenizer, model_args, finetuning_args)  # must after fixing tokenizer to resize vocab
 
     if getattr(model, "quantization_method", None) and model_args.adapter_name_or_path is not None:
         raise ValueError("Cannot merge adapters to a quantized model.")
diff --git a/src/llmtuner/train/utils.py b/src/llmtuner/train/utils.py
index 8f218a78..cf199633 100644
--- a/src/llmtuner/train/utils.py
+++ b/src/llmtuner/train/utils.py
@@ -10,7 +10,7 @@ from transformers.utils.versions import require_version
 from ..extras.logging import get_logger
 from ..extras.packages import is_galore_available
 from ..hparams import FinetuningArguments, ModelArguments
-from ..model import find_all_linear_modules, load_model_and_tokenizer, load_valuehead_params
+from ..model import find_all_linear_modules, load_model, load_tokenizer, load_valuehead_params
 
 
 if is_galore_available():
@@ -87,16 +87,18 @@ def create_ref_model(
         )
         ref_model_args = ModelArguments(**ref_model_args_dict)
         ref_finetuning_args = FinetuningArguments(finetuning_type="lora")
-        ref_model, _ = load_model_and_tokenizer(
-            ref_model_args, ref_finetuning_args, is_trainable=False, add_valuehead=add_valuehead
+        tokenizer = load_tokenizer(ref_model_args)
+        ref_model = load_model(
+            tokenizer, ref_model_args, ref_finetuning_args, is_trainable=False, add_valuehead=add_valuehead
         )
         logger.info("Created reference model from {}".format(finetuning_args.ref_model))
     else:
         if finetuning_args.finetuning_type == "lora":
             ref_model = None
         else:
-            ref_model, _ = load_model_and_tokenizer(
-                model_args, finetuning_args, is_trainable=False, add_valuehead=add_valuehead
+            tokenizer = load_tokenizer(model_args)
+            ref_model = load_model(
+                tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=add_valuehead
             )
             logger.info("Created reference model from the model itself.")
 
@@ -141,8 +143,9 @@ def create_reward_model(
         )
         reward_model_args = ModelArguments(**reward_model_args_dict)
         reward_finetuning_args = FinetuningArguments(finetuning_type="lora")
-        reward_model, _ = load_model_and_tokenizer(
-            reward_model_args, reward_finetuning_args, is_trainable=False, add_valuehead=True
+        tokenizer = load_tokenizer(reward_model_args)
+        reward_model = load_model(
+            tokenizer, reward_model_args, reward_finetuning_args, is_trainable=False, add_valuehead=True
         )
         logger.info("Loaded full weights of reward model from {}".format(finetuning_args.reward_model))
         logger.warning("Please ensure the ppo model and reward model share SAME tokenizer and vocabulary.")