diff --git a/src/llmtuner/hparams/model_args.py b/src/llmtuner/hparams/model_args.py index ac70bb3c..996eabae 100644 --- a/src/llmtuner/hparams/model_args.py +++ b/src/llmtuner/hparams/model_args.py @@ -85,6 +85,10 @@ class ModelArguments: default=False, metadata={"help": "Whethor or not to use multimodal LLM that accepts visual inputs."}, ) + autocast_projector: bool = field( + default=True, + metadata={"help": "Whethor or not to autocast projector."}, + ) moe_aux_loss_coef: Optional[float] = field( default=None, metadata={"help": "Coefficient of the auxiliary router loss in mixture-of-experts model."}, diff --git a/src/llmtuner/model/loader.py b/src/llmtuner/model/loader.py index ead6178f..1dca84a1 100644 --- a/src/llmtuner/model/loader.py +++ b/src/llmtuner/model/loader.py @@ -155,7 +155,8 @@ def load_model( model.eval() else: model.train() - + if model_args.visual_inputs: + model.vision_tower.requires_grad_(False) trainable_params, all_param = count_parameters(model) if is_trainable: param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format( diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py index 31cba492..6ca6f2e5 100644 --- a/src/llmtuner/model/patcher.py +++ b/src/llmtuner/model/patcher.py @@ -101,7 +101,7 @@ def patch_model( if model_args.resize_vocab: resize_embedding_layer(model, tokenizer) - if model_args.visual_inputs: + if model_args.visual_inputs and model_args.autocast_projector: autocast_projector_dtype(model, model_args) if is_trainable: