diff --git a/src/llmtuner/chat/vllm_engine.py b/src/llmtuner/chat/vllm_engine.py
index aaaad2f1..8d602655 100644
--- a/src/llmtuner/chat/vllm_engine.py
+++ b/src/llmtuner/chat/vllm_engine.py
@@ -2,9 +2,11 @@ import uuid
 from typing import TYPE_CHECKING, AsyncGenerator, AsyncIterator, Dict, List, Optional, Sequence
 
 from ..data import get_template_and_fix_tokenizer
+from ..extras.logging import get_logger
 from ..extras.misc import get_device_count, infer_optim_dtype
 from ..extras.packages import is_vllm_available
 from ..model import load_config, load_tokenizer
+from ..model.utils.visual import LlavaMultiModalProjectorForYiVLForVLLM
 from .base_engine import BaseEngine, Response
 
 
@@ -22,6 +24,9 @@ if TYPE_CHECKING:
     from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments
 
 
+logger = get_logger(__name__)
+
+
 class VllmEngine(BaseEngine):
     def __init__(
         self,
@@ -57,13 +62,19 @@ class VllmEngine(BaseEngine):
         }
 
         if model_args.visual_inputs:
-            # TODO: auto derive from config
-            # https://github.com/vllm-project/vllm/pull/3042#issuecomment-1984893549
-            self.image_feature_size = 576
+            image_size = config.vision_config.image_size
+            patch_size = config.vision_config.patch_size
+            self.image_feature_size = (image_size // patch_size) ** 2
             engine_args["image_input_type"] = "pixel_values"
             engine_args["image_token_id"] = self.tokenizer.convert_tokens_to_ids("<image>")
-            engine_args["image_input_shape"] = "1,3,336,336"
+            engine_args["image_input_shape"] = "1,3,{},{}".format(image_size, image_size)
             engine_args["image_feature_size"] = self.image_feature_size
+            if getattr(config, "is_yi_vl_derived_model", None):
+                # bug in vllm 0.4.2, see: https://github.com/vllm-project/vllm/pull/4828
+                import vllm.model_executor.models.llava
+
+                logger.info("Detected Yi-VL model, applying projector patch.")
+                vllm.model_executor.models.llava.LlavaMultiModalProjector = LlavaMultiModalProjectorForYiVLForVLLM
 
         self.model = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**engine_args))
         if model_args.adapter_name_or_path is not None:
diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index b20c9203..b7a34b59 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -865,7 +865,7 @@ _register_template(
         "Assume the role of the AI assistant. Read all the images carefully, "
         "and respond to the human's questions with informative, helpful, detailed and polite answers. "
" "这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。" - "仔细阅读所有的图像,并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。\n" + "仔细阅读所有的图像,并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。\n\n" ), stop_words=["###"], ) diff --git a/src/llmtuner/hparams/parser.py b/src/llmtuner/hparams/parser.py index 7fdd3234..20f9a003 100644 --- a/src/llmtuner/hparams/parser.py +++ b/src/llmtuner/hparams/parser.py @@ -285,7 +285,7 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: model_args.model_max_length = data_args.cutoff_len data_args.packing = data_args.packing if data_args.packing is not None else finetuning_args.stage == "pt" - # Log on each process the small summary: + # Log on each process the small summary logger.info( "Process rank: {}, device: {}, n_gpu: {}, distributed training: {}, compute dtype: {}".format( training_args.local_rank, diff --git a/src/llmtuner/model/utils/visual.py b/src/llmtuner/model/utils/visual.py index 9a5134ff..33fb394d 100644 --- a/src/llmtuner/model/utils/visual.py +++ b/src/llmtuner/model/utils/visual.py @@ -16,25 +16,51 @@ if TYPE_CHECKING: logger = get_logger(__name__) -class LlavaMultiModalProjector(torch.nn.Module): - def __init__(self, config: "LlavaConfig"): +class LlavaMultiModalProjectorForYiVL(torch.nn.Module): + def __init__(self, config: "LlavaConfig") -> None: super().__init__() + self.config = config + if config is None: + return + self.linear_1 = torch.nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True) self.linear_2 = torch.nn.LayerNorm(config.text_config.hidden_size, bias=True) self.linear_3 = torch.nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True) self.linear_4 = torch.nn.LayerNorm(config.text_config.hidden_size, bias=True) self.act = ACT2FN[config.projector_hidden_act] - def forward(self, image_features): + def forward(self, image_features: "torch.Tensor") -> "torch.Tensor": hidden_states = self.linear_1(image_features) hidden_states = self.linear_2(hidden_states) hidden_states = self.act(hidden_states) hidden_states = self.linear_3(hidden_states) hidden_states = self.linear_4(hidden_states) + if hidden_states.dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.linear_1.weight.dtype + + logger.warning_once("The hidden states seems to be silently casted in float32.") + hidden_states = hidden_states.to(target_dtype) + return hidden_states +class LlavaMultiModalProjectorForYiVLForVLLM(LlavaMultiModalProjectorForYiVL): + def __init__(self, vision_hidden_size: int, text_hidden_size: int, projector_hidden_act: str) -> None: + super().__init__(config=None) + + self.linear_1 = torch.nn.Linear(vision_hidden_size, text_hidden_size, bias=True) + self.linear_2 = torch.nn.LayerNorm(text_hidden_size, bias=True) + self.linear_3 = torch.nn.Linear(text_hidden_size, text_hidden_size, bias=True) + self.linear_4 = torch.nn.LayerNorm(text_hidden_size, bias=True) + self.act = torch.nn.GELU() + + def autocast_projector_dtype( model: "PreTrainedModel", model_args: "ModelArguments", mm_projector_name: str = "multi_modal_projector" ) -> None: @@ -53,5 +79,6 @@ def configure_visual_model(config: "PretrainedConfig") -> None: if getattr(config, "model_type", None) == "llava": setattr(config, "hidden_size", getattr(config.text_config, "hidden_size", None)) - if getattr(config, "is_yi_vl_derived_model", None): - 
-        transformers.models.llava.modeling_llava.LlavaMultiModalProjector = LlavaMultiModalProjector
+    if getattr(config, "is_yi_vl_derived_model", None):
+        logger.info("Detected Yi-VL model, applying projector patch.")
+        transformers.models.llava.modeling_llava.LlavaMultiModalProjector = LlavaMultiModalProjectorForYiVL
diff --git a/src/llmtuner/train/dpo/trainer.py b/src/llmtuner/train/dpo/trainer.py
index b144d561..3c0b0276 100644
--- a/src/llmtuner/train/dpo/trainer.py
+++ b/src/llmtuner/train/dpo/trainer.py
@@ -13,7 +13,7 @@ from ..utils import create_custom_optimzer, create_custom_scheduler
 
 
 if TYPE_CHECKING:
-    from transformers import PreTrainedModel
+    from transformers import PreTrainedModel, ProcessorMixin
 
     from ...hparams import FinetuningArguments
 
@@ -24,6 +24,7 @@ class CustomDPOTrainer(DPOTrainer):
         model: Union["PreTrainedModel", torch.nn.Module],
         ref_model: Optional[Union["PreTrainedModel", torch.nn.Module]],
         finetuning_args: "FinetuningArguments",
+        processor: Optional["ProcessorMixin"],
         disable_dropout: bool = True,
         **kwargs,
     ):
@@ -33,6 +34,7 @@ class CustomDPOTrainer(DPOTrainer):
             disable_dropout_in_model(ref_model)
 
         self.finetuning_args = finetuning_args
+        self.processor = processor
         self.reference_free = False
         self.use_dpo_data_collator = True  # hack to avoid warning
         self.generate_during_eval = False  # disable at evaluation
@@ -80,6 +82,12 @@ class CustomDPOTrainer(DPOTrainer):
         create_custom_scheduler(self.args, num_training_steps, optimizer)
         return super().create_scheduler(num_training_steps, optimizer)
 
+    def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None:
+        super()._save(output_dir, state_dict)
+        if self.processor is not None:
+            output_dir = output_dir if output_dir is not None else self.args.output_dir
+            getattr(self.processor, "image_processor").save_pretrained(output_dir)
+
     def sft_loss(self, chosen_logits: "torch.FloatTensor", chosen_labels: "torch.LongTensor") -> "torch.Tensor":
         r"""
         Computes supervised cross-entropy loss of given labels under the given logits.
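
Note: the _save override added to CustomDPOTrainer above is repeated verbatim in the ORPO, PPO, pre-training and reward-model trainers below. A minimal sketch of the shared behavior, assuming a Trainer-style _save(output_dir, state_dict) and a processor that exposes an image_processor; the ProcessorSavingMixin name is hypothetical and not part of this patch:

    # Sketch only: the repeated _save override, expressed once as a mixin that is
    # intended to be combined with a transformers Trainer subclass.
    from typing import Dict, Optional

    import torch  # only needed so the "torch.Tensor" annotation resolves for readers


    class ProcessorSavingMixin:
        """Writes the multimodal image processor next to the model checkpoint."""

        def _save(
            self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None
        ) -> None:
            super()._save(output_dir, state_dict)  # save model weights/config as usual
            if getattr(self, "processor", None) is not None:
                output_dir = output_dir if output_dir is not None else self.args.output_dir
                getattr(self.processor, "image_processor").save_pretrained(output_dir)

In the patch itself the method is simply duplicated in each trainer, which keeps every class self-contained at the cost of repetition.
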
diff --git a/src/llmtuner/train/dpo/workflow.py b/src/llmtuner/train/dpo/workflow.py
index b19a643e..8ac4952a 100644
--- a/src/llmtuner/train/dpo/workflow.py
+++ b/src/llmtuner/train/dpo/workflow.py
@@ -50,9 +50,9 @@ def run_dpo(
         ref_model=ref_model,
         args=training_args,
         finetuning_args=finetuning_args,
-        tokenizer=tokenizer,
         data_collator=data_collator,
         callbacks=callbacks,
+        **tokenizer_module,
         **split_dataset(dataset, data_args, training_args),
     )
 
diff --git a/src/llmtuner/train/orpo/trainer.py b/src/llmtuner/train/orpo/trainer.py
index 88090a9e..1b743647 100644
--- a/src/llmtuner/train/orpo/trainer.py
+++ b/src/llmtuner/train/orpo/trainer.py
@@ -13,7 +13,7 @@ from ..utils import create_custom_optimzer, create_custom_scheduler
 
 
 if TYPE_CHECKING:
-    from transformers import PreTrainedModel
+    from transformers import PreTrainedModel, ProcessorMixin
 
     from ...hparams import FinetuningArguments
 
@@ -23,6 +23,7 @@ class CustomORPOTrainer(DPOTrainer):
         self,
         model: Union["PreTrainedModel", "torch.nn.Module"],
         finetuning_args: "FinetuningArguments",
+        processor: Optional["ProcessorMixin"],
         disable_dropout: bool = True,
         **kwargs,
     ):
@@ -30,6 +31,7 @@ class CustomORPOTrainer(DPOTrainer):
             disable_dropout_in_model(model)
 
         self.finetuning_args = finetuning_args
+        self.processor = processor
         self.reference_free = False
         self.use_dpo_data_collator = True  # hack to avoid warning
         self.generate_during_eval = False  # disable at evaluation
@@ -61,6 +63,12 @@ class CustomORPOTrainer(DPOTrainer):
         create_custom_scheduler(self.args, num_training_steps, optimizer)
         return super().create_scheduler(num_training_steps, optimizer)
 
+    def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None:
+        super()._save(output_dir, state_dict)
+        if self.processor is not None:
+            output_dir = output_dir if output_dir is not None else self.args.output_dir
+            getattr(self.processor, "image_processor").save_pretrained(output_dir)
+
     def odds_ratio_loss(self, chosen_logps: "torch.Tensor", rejected_logps: "torch.Tensor") -> "torch.Tensor":
         r"""
         Computes ORPO's odds ratio (OR) loss.
diff --git a/src/llmtuner/train/orpo/workflow.py b/src/llmtuner/train/orpo/workflow.py
index 9c870096..6ea18dae 100644
--- a/src/llmtuner/train/orpo/workflow.py
+++ b/src/llmtuner/train/orpo/workflow.py
@@ -43,9 +43,9 @@ def run_orpo(
         model=model,
         args=training_args,
         finetuning_args=finetuning_args,
-        tokenizer=tokenizer,
         data_collator=data_collator,
         callbacks=callbacks,
+        **tokenizer_module,
         **split_dataset(dataset, data_args, training_args),
     )
 
diff --git a/src/llmtuner/train/ppo/trainer.py b/src/llmtuner/train/ppo/trainer.py
index ef769968..985664b7 100644
--- a/src/llmtuner/train/ppo/trainer.py
+++ b/src/llmtuner/train/ppo/trainer.py
@@ -23,7 +23,13 @@ from .utils import dump_layernorm, get_rewards_from_server, replace_model, resto
 
 if TYPE_CHECKING:
     from datasets import Dataset
-    from transformers import DataCollatorWithPadding, PreTrainedTokenizer, Seq2SeqTrainingArguments, TrainerCallback
+    from transformers import (
+        DataCollatorWithPadding,
+        PreTrainedTokenizer,
+        ProcessorMixin,
+        Seq2SeqTrainingArguments,
+        TrainerCallback,
+    )
     from trl import AutoModelForCausalLMWithValueHead
 
     from ...hparams import FinetuningArguments, GeneratingArguments, ModelArguments
@@ -48,6 +54,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
         reward_model: Optional["AutoModelForCausalLMWithValueHead"],
         ref_model: Optional["AutoModelForCausalLMWithValueHead"],
         tokenizer: "PreTrainedTokenizer",
+        processor: Optional["ProcessorMixin"],
         dataset: "Dataset",
         data_collator: "DataCollatorWithPadding",
     ):
@@ -97,6 +104,7 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
         self.finetuning_args = finetuning_args
         self.reward_model = reward_model
         self.current_device = get_current_device()  # patch for deepspeed training
+        self.processor = processor
 
         self.generation_config = GenerationConfig(
             pad_token_id=self.tokenizer.pad_token_id,
@@ -295,6 +303,12 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
         )
         return lr_scheduler
 
+    def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None:
+        super()._save(output_dir, state_dict)
+        if self.processor is not None:
+            output_dir = output_dir if output_dir is not None else self.args.output_dir
+            getattr(self.processor, "image_processor").save_pretrained(output_dir)
+
     @torch.no_grad()
     def get_inputs(self, batch: Dict[str, torch.Tensor]) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
         r"""
diff --git a/src/llmtuner/train/ppo/workflow.py b/src/llmtuner/train/ppo/workflow.py
index 8cd15932..4383bcdc 100644
--- a/src/llmtuner/train/ppo/workflow.py
+++ b/src/llmtuner/train/ppo/workflow.py
@@ -49,9 +49,9 @@ def run_ppo(
         model=model,
         reward_model=reward_model,
         ref_model=ref_model,
-        tokenizer=tokenizer,
         dataset=dataset,
         data_collator=data_collator,
+        **tokenizer_module,
     )
 
     # Training
diff --git a/src/llmtuner/train/pt/trainer.py b/src/llmtuner/train/pt/trainer.py
index 969ebf04..b7b80f88 100644
--- a/src/llmtuner/train/pt/trainer.py
+++ b/src/llmtuner/train/pt/trainer.py
@@ -1,5 +1,5 @@
 from types import MethodType
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, Dict, Optional
 
 from transformers import Trainer
 
@@ -9,6 +9,7 @@ from ..utils import create_custom_optimzer, create_custom_scheduler
 
 if TYPE_CHECKING:
     import torch
+    from transformers import ProcessorMixin
 
     from ...hparams import FinetuningArguments
 
@@ -21,9 +22,12 @@ class CustomTrainer(Trainer):
     Inherits Trainer for custom optimizer.
""" - def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None: + def __init__( + self, finetuning_args: "FinetuningArguments", processor: Optional["ProcessorMixin"], **kwargs + ) -> None: super().__init__(**kwargs) self.finetuning_args = finetuning_args + self.processor = processor if finetuning_args.use_badam: from badam import clip_grad_norm_for_sparse_tensor @@ -39,3 +43,9 @@ class CustomTrainer(Trainer): ) -> "torch.optim.lr_scheduler.LRScheduler": create_custom_scheduler(self.args, num_training_steps, optimizer) return super().create_scheduler(num_training_steps, optimizer) + + def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: + super()._save(output_dir, state_dict) + if self.processor is not None: + output_dir = output_dir if output_dir is not None else self.args.output_dir + getattr(self.processor, "image_processor").save_pretrained(output_dir) diff --git a/src/llmtuner/train/pt/workflow.py b/src/llmtuner/train/pt/workflow.py index 3b127da4..9f945901 100644 --- a/src/llmtuner/train/pt/workflow.py +++ b/src/llmtuner/train/pt/workflow.py @@ -36,9 +36,9 @@ def run_pt( model=model, args=training_args, finetuning_args=finetuning_args, - tokenizer=tokenizer, data_collator=data_collator, callbacks=callbacks, + **tokenizer_module, **split_dataset(dataset, data_args, training_args), ) diff --git a/src/llmtuner/train/rm/trainer.py b/src/llmtuner/train/rm/trainer.py index 0f5d88d3..d49dd67b 100644 --- a/src/llmtuner/train/rm/trainer.py +++ b/src/llmtuner/train/rm/trainer.py @@ -11,7 +11,7 @@ from ..utils import create_custom_optimzer, create_custom_scheduler if TYPE_CHECKING: - from transformers.modeling_utils import PreTrainedModel + from transformers import PreTrainedModel, ProcessorMixin from transformers.trainer import PredictionOutput from ...hparams import FinetuningArguments @@ -25,9 +25,12 @@ class PairwiseTrainer(Trainer): Inherits Trainer to compute pairwise loss. 
""" - def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None: + def __init__( + self, finetuning_args: "FinetuningArguments", processor: Optional["ProcessorMixin"], **kwargs + ) -> None: super().__init__(**kwargs) self.finetuning_args = finetuning_args + self.processor = processor self.can_return_loss = True # override property to return eval_loss if finetuning_args.use_badam: from badam import clip_grad_norm_for_sparse_tensor @@ -45,6 +48,12 @@ class PairwiseTrainer(Trainer): create_custom_scheduler(self.args, num_training_steps, optimizer) return super().create_scheduler(num_training_steps, optimizer) + def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: + super()._save(output_dir, state_dict) + if self.processor is not None: + output_dir = output_dir if output_dir is not None else self.args.output_dir + getattr(self.processor, "image_processor").save_pretrained(output_dir) + def compute_loss( self, model: "PreTrainedModel", inputs: Dict[str, torch.Tensor], return_outputs: bool = False ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]]: diff --git a/src/llmtuner/train/rm/workflow.py b/src/llmtuner/train/rm/workflow.py index bd0a756c..621d03b7 100644 --- a/src/llmtuner/train/rm/workflow.py +++ b/src/llmtuner/train/rm/workflow.py @@ -39,10 +39,10 @@ def run_rm( model=model, args=training_args, finetuning_args=finetuning_args, - tokenizer=tokenizer, data_collator=data_collator, callbacks=callbacks + [FixValueHeadModelCallback()], compute_metrics=compute_accuracy, + **tokenizer_module, **split_dataset(dataset, data_args, training_args), ) diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py index 59515a62..168abd86 100644 --- a/src/llmtuner/webui/runner.py +++ b/src/llmtuner/webui/runner.py @@ -107,6 +107,7 @@ class Runner: model_name_or_path=get("top.model_path"), adapter_name_or_path=adapter_name_or_path, cache_dir=user_config.get("cache_dir", None), + preprocessing_num_workers=16, finetuning_type=get("top.finetuning_type"), quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, template=get("top.template"), @@ -141,6 +142,7 @@ class Runner: fp16=(get("train.compute_type") == "fp16"), bf16=(get("train.compute_type") == "bf16"), pure_bf16=(get("train.compute_type") == "pure_bf16"), + plot_loss=True, ) if args["finetuning_type"] == "freeze": @@ -214,6 +216,7 @@ class Runner: model_name_or_path=get("top.model_path"), adapter_name_or_path=adapter_name_or_path, cache_dir=user_config.get("cache_dir", None), + preprocessing_num_workers=16, finetuning_type=get("top.finetuning_type"), quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, template=get("top.template"), diff --git a/src/llmtuner/webui/utils.py b/src/llmtuner/webui/utils.py index 1f2b0591..3d34f0d2 100644 --- a/src/llmtuner/webui/utils.py +++ b/src/llmtuner/webui/utils.py @@ -42,7 +42,6 @@ def clean_cmd(args: Dict[str, Any]) -> Dict[str, Any]: def gen_cmd(args: Dict[str, Any]) -> str: - args["plot_loss"] = args.get("do_train", None) current_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "0") cmd_lines = ["CUDA_VISIBLE_DEVICES={} llamafactory-cli train ".format(current_devices)] for k, v in clean_cmd(args).items():