From 29a6d5bdb8610be8f796eed65eede9ba7b503527 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Tue, 21 May 2024 08:57:14 +0800 Subject: [PATCH 1/4] support pretraining of llava --- data/dataset_info.json | 14 ++++ data/mllm_pt_demo.json | 92 ++++++++++++++++++++++++++ src/llamafactory/hparams/model_args.py | 4 ++ src/llamafactory/model/loader.py | 5 ++ 4 files changed, 115 insertions(+) create mode 100644 data/mllm_pt_demo.json diff --git a/data/dataset_info.json b/data/dataset_info.json index b985582e..5a90e077 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -38,6 +38,20 @@ "assistant_tag": "assistant" } }, + "mllm_pt_demo": { + "file_name": "mllm_pt_demo.json", + "formatting": "sharegpt", + "columns": { + "messages": "messages", + "images": "images" + }, + "tags": { + "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant" + } + }, "alpaca_en": { "hf_hub_url": "llamafactory/alpaca_en", "ms_hub_url": "llamafactory/alpaca_en" diff --git a/data/mllm_pt_demo.json b/data/mllm_pt_demo.json new file mode 100644 index 00000000..2ee01ce6 --- /dev/null +++ b/data/mllm_pt_demo.json @@ -0,0 +1,92 @@ +[ + { + "messages": [ + { + "content": "Render a clear and concise summary of the photo.", + "role": "user" + }, + { + "content": "There are two soccer players on the field.", + "role": "assistant" + } + ], + "images": [ + "mllm_demo_data/1.jpg" + ] + }, + { + "messages": [ + { + "content": "Write a terse but informative summary of the picture.", + "role": "user" + }, + { + "content": "A soccer player is sliding on his knees to celebrate", + "role": "assistant" + } + ], + "images": [ + "mllm_demo_data/2.jpg" + ] + }, + { + "messages": [ + { + "content": "What is this?", + "role": "user" + }, + { + "content": "A man is giving a speech.", + "role": "assistant" + } + ], + "images": [ + "mllm_demo_data/3.jpg" + ] + }, + { + "messages": [ + { + "content": "对照片进行简明扼要的概括。", + "role": "user" + }, + { + "content": "两个足球运动员在场上", + "role": "assistant" + } + ], + "images": [ + "mllm_demo_data/1.jpg" + ] + }, + { + "messages": [ + { + "content": "为图片写一个简短但内容丰富的摘要。", + "role": "user" + }, + { + "content": "一个足球运动员在跪地滑行庆祝", + "role": "assistant" + } + ], + "images": [ + "mllm_demo_data/2.jpg" + ] + }, + { + "messages": [ + { + "content": "这是什么?", + "role": "user" + }, + { + "content": "一个男人在演讲", + "role": "assistant" + } + ], + "images": [ + "mllm_demo_data/3.jpg" + ] + } +] \ No newline at end of file diff --git a/src/llamafactory/hparams/model_args.py b/src/llamafactory/hparams/model_args.py index 5885bb09..255051dc 100644 --- a/src/llamafactory/hparams/model_args.py +++ b/src/llamafactory/hparams/model_args.py @@ -85,6 +85,10 @@ class ModelArguments: default=False, metadata={"help": "Whethor or not to use multimodal LLM that accepts visual inputs."}, ) + tune_mm_proj: bool = field( + default=False, + metadata={"help": "Whethor or not only finetune mm_projector for MLLM."}, + ) moe_aux_loss_coef: Optional[float] = field( default=None, metadata={"help": "Coefficient of the auxiliary router loss in mixture-of-experts model."}, diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index 49b347d5..d9784593 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -163,6 +163,11 @@ def load_model( else: model.train() + if model_args.visual_inputs and model_args.tune_mm_proj: + lm_params = [param for name, param in model.named_parameters() if "language_model" in name] + for param in lm_params: + param.requires_grad_(False) + trainable_params, all_param = count_parameters(model) if is_trainable: param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format( From 57eb13b75d8597d748e84d3549a0b08876b669db Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Mon, 27 May 2024 18:59:00 +0800 Subject: [PATCH 2/4] add regex of only tune lm and mm_proj --- sites/paligemma-pt.yaml | 49 ++++++++++++++++++++++++++ sites/paligemma.yaml | 49 ++++++++++++++++++++++++++ sites/paligemma_lora.yaml | 40 +++++++++++++++++++++ src/llamafactory/model/adapter.py | 7 ++++ src/llamafactory/model/loader.py | 5 --- src/llamafactory/model/utils/visual.py | 7 +++- 6 files changed, 151 insertions(+), 6 deletions(-) create mode 100644 sites/paligemma-pt.yaml create mode 100644 sites/paligemma.yaml create mode 100644 sites/paligemma_lora.yaml diff --git a/sites/paligemma-pt.yaml b/sites/paligemma-pt.yaml new file mode 100644 index 00000000..4305cf5f --- /dev/null +++ b/sites/paligemma-pt.yaml @@ -0,0 +1,49 @@ +# model +model_name_or_path: google/paligemma-3b-mix-448 +visual_inputs: true +tune_mm_proj: true +#print_param_status: true + +# method +stage: sft +do_train: true +finetuning_type: full + +# ddp +ddp_timeout: 180000000 +deepspeed: examples/deepspeed/ds_z2_offload_config.json + +# dataset +dataset: mllm_pt_demo +dataset_dir: data +template: gemma +cutoff_len: 2048 +max_samples: 3 +#val_size: 0.0001 +overwrite_cache: true +preprocessing_num_workers: 16 + +# output +output_dir: saves/paligemma/full/sft_llava_pt_test +logging_steps: 1 +save_steps: 50 +plot_loss: true +overwrite_output_dir: true +#save_strategy: epoch +#save_total_limit: 2 + +# train +per_device_train_batch_size: 1 +gradient_accumulation_steps: 16 +learning_rate: 0.00001 +num_train_epochs: 100 +lr_scheduler_type: cosine +warmup_steps: 0.1 +#bf16: true +pure_bf16: true + +# eval +do_eval: false +#per_device_eval_batch_size: 1 +#evaluation_strategy: steps +#eval_steps: 500 diff --git a/sites/paligemma.yaml b/sites/paligemma.yaml new file mode 100644 index 00000000..f3257cfc --- /dev/null +++ b/sites/paligemma.yaml @@ -0,0 +1,49 @@ +# model +model_name_or_path: google/paligemma-3b-mix-448 +visual_inputs: true +#print_param_status: true +use_fast_tokenizer: false + +# method +stage: sft +do_train: true +finetuning_type: full + +# ddp +ddp_timeout: 180000000 +deepspeed: examples/deepspeed/ds_z2_offload_config.json + +# dataset +dataset: mllm_demo +dataset_dir: data +template: gemma +cutoff_len: 2048 +max_samples: 3 +#val_size: 0.0001 +overwrite_cache: true +preprocessing_num_workers: 16 + +# output +output_dir: saves/paligemma/full/sft_llava_1k +logging_steps: 1 +save_steps: 50 +plot_loss: true +overwrite_output_dir: true +#save_strategy: epoch +#save_total_limit: 2 + +# train +per_device_train_batch_size: 1 +gradient_accumulation_steps: 16 +learning_rate: 0.00001 +num_train_epochs: 100 +lr_scheduler_type: cosine +warmup_steps: 0.1 +#bf16: true +pure_bf16: true + +# eval +do_eval: false +#per_device_eval_batch_size: 1 +#evaluation_strategy: steps +#eval_steps: 500 diff --git a/sites/paligemma_lora.yaml b/sites/paligemma_lora.yaml new file mode 100644 index 00000000..0693a6ae --- /dev/null +++ b/sites/paligemma_lora.yaml @@ -0,0 +1,40 @@ +### model +model_name_or_path: google/paligemma-3b-mix-448 +visual_inputs: true +use_fast_tokenizer: false + +### method +stage: sft +do_train: true +finetuning_type: lora +lora_target: q_proj,v_proj + +### dataset +dataset: mllm_demo +template: gemma +cutoff_len: 1024 +max_samples: 1000 +overwrite_cache: true +preprocessing_num_workers: 16 + +### output +output_dir: saves/paligemma/lora/sft_mllm +logging_steps: 10 +save_steps: 500 +plot_loss: true +overwrite_output_dir: true + +### train +per_device_train_batch_size: 1 +gradient_accumulation_steps: 8 +learning_rate: 0.0001 +num_train_epochs: 3.0 +lr_scheduler_type: cosine +warmup_steps: 0.1 +fp16: true + +### eval +val_size: 0.1 +per_device_eval_batch_size: 1 +evaluation_strategy: steps +eval_steps: 500 diff --git a/src/llamafactory/model/adapter.py b/src/llamafactory/model/adapter.py index f37f3bbb..015db8a0 100644 --- a/src/llamafactory/model/adapter.py +++ b/src/llamafactory/model/adapter.py @@ -10,6 +10,7 @@ from ..extras.logging import get_logger from .utils.misc import find_all_linear_modules, find_expanded_modules from .utils.quantization import QuantizationMethod from .utils.unsloth import get_unsloth_peft_model, load_unsloth_peft_model +from .utils.visual import filter_vision_tower_linear if TYPE_CHECKING: @@ -58,6 +59,9 @@ def init_adapter( if model_args.visual_inputs and hasattr(model, "vision_tower"): # freeze vision model model.vision_tower.requires_grad_(False) + if model_args.visual_inputs and hasattr(model, "language_model") and model_args.tune_mm_proj: # freeze language model if only tune mm_proj + model.language_model.requires_grad_(False) + if finetuning_args.finetuning_type == "freeze" and is_trainable: logger.info("Fine-tuning method: Freeze") num_layers = ( @@ -180,6 +184,9 @@ def init_adapter( if finetuning_args.use_llama_pro: target_modules = find_expanded_modules(model, target_modules, finetuning_args.num_layer_trainable) + if model_args.visual_inputs: + target_modules = filter_vision_tower_linear(target_modules) + if ( finetuning_args.use_dora and getattr(model, "quantization_method", None) is not None diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index d9784593..49b347d5 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -163,11 +163,6 @@ def load_model( else: model.train() - if model_args.visual_inputs and model_args.tune_mm_proj: - lm_params = [param for name, param in model.named_parameters() if "language_model" in name] - for param in lm_params: - param.requires_grad_(False) - trainable_params, all_param = count_parameters(model) if is_trainable: param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format( diff --git a/src/llamafactory/model/utils/visual.py b/src/llamafactory/model/utils/visual.py index c8260b7f..a91777ba 100644 --- a/src/llamafactory/model/utils/visual.py +++ b/src/llamafactory/model/utils/visual.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Tuple +from typing import TYPE_CHECKING, Tuple, List import torch import transformers.models @@ -82,3 +82,8 @@ def configure_visual_model(config: "PretrainedConfig") -> None: if getattr(config, "is_yi_vl_derived_model", None): logger.info("Detected Yi-VL model, applying projector patch.") transformers.models.llava.modeling_llava.LlavaMultiModalProjector = LlavaMultiModalProjectorForYiVL + + +def filter_vision_tower_linear(target_modules: List[str]) -> str: + target_modules = f"^(?!.*vision_tower).*(?:{'|'.join(target_modules)}).*" + return target_modules From 7ae9a4726cb99abb4c80f6b4b37590dc04ea6660 Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Mon, 27 May 2024 19:00:15 +0800 Subject: [PATCH 3/4] add only tune lm and mm_proj --- sites/paligemma-pt.yaml | 49 --------------------------------------- sites/paligemma.yaml | 49 --------------------------------------- sites/paligemma_lora.yaml | 40 -------------------------------- 3 files changed, 138 deletions(-) delete mode 100644 sites/paligemma-pt.yaml delete mode 100644 sites/paligemma.yaml delete mode 100644 sites/paligemma_lora.yaml diff --git a/sites/paligemma-pt.yaml b/sites/paligemma-pt.yaml deleted file mode 100644 index 4305cf5f..00000000 --- a/sites/paligemma-pt.yaml +++ /dev/null @@ -1,49 +0,0 @@ -# model -model_name_or_path: google/paligemma-3b-mix-448 -visual_inputs: true -tune_mm_proj: true -#print_param_status: true - -# method -stage: sft -do_train: true -finetuning_type: full - -# ddp -ddp_timeout: 180000000 -deepspeed: examples/deepspeed/ds_z2_offload_config.json - -# dataset -dataset: mllm_pt_demo -dataset_dir: data -template: gemma -cutoff_len: 2048 -max_samples: 3 -#val_size: 0.0001 -overwrite_cache: true -preprocessing_num_workers: 16 - -# output -output_dir: saves/paligemma/full/sft_llava_pt_test -logging_steps: 1 -save_steps: 50 -plot_loss: true -overwrite_output_dir: true -#save_strategy: epoch -#save_total_limit: 2 - -# train -per_device_train_batch_size: 1 -gradient_accumulation_steps: 16 -learning_rate: 0.00001 -num_train_epochs: 100 -lr_scheduler_type: cosine -warmup_steps: 0.1 -#bf16: true -pure_bf16: true - -# eval -do_eval: false -#per_device_eval_batch_size: 1 -#evaluation_strategy: steps -#eval_steps: 500 diff --git a/sites/paligemma.yaml b/sites/paligemma.yaml deleted file mode 100644 index f3257cfc..00000000 --- a/sites/paligemma.yaml +++ /dev/null @@ -1,49 +0,0 @@ -# model -model_name_or_path: google/paligemma-3b-mix-448 -visual_inputs: true -#print_param_status: true -use_fast_tokenizer: false - -# method -stage: sft -do_train: true -finetuning_type: full - -# ddp -ddp_timeout: 180000000 -deepspeed: examples/deepspeed/ds_z2_offload_config.json - -# dataset -dataset: mllm_demo -dataset_dir: data -template: gemma -cutoff_len: 2048 -max_samples: 3 -#val_size: 0.0001 -overwrite_cache: true -preprocessing_num_workers: 16 - -# output -output_dir: saves/paligemma/full/sft_llava_1k -logging_steps: 1 -save_steps: 50 -plot_loss: true -overwrite_output_dir: true -#save_strategy: epoch -#save_total_limit: 2 - -# train -per_device_train_batch_size: 1 -gradient_accumulation_steps: 16 -learning_rate: 0.00001 -num_train_epochs: 100 -lr_scheduler_type: cosine -warmup_steps: 0.1 -#bf16: true -pure_bf16: true - -# eval -do_eval: false -#per_device_eval_batch_size: 1 -#evaluation_strategy: steps -#eval_steps: 500 diff --git a/sites/paligemma_lora.yaml b/sites/paligemma_lora.yaml deleted file mode 100644 index 0693a6ae..00000000 --- a/sites/paligemma_lora.yaml +++ /dev/null @@ -1,40 +0,0 @@ -### model -model_name_or_path: google/paligemma-3b-mix-448 -visual_inputs: true -use_fast_tokenizer: false - -### method -stage: sft -do_train: true -finetuning_type: lora -lora_target: q_proj,v_proj - -### dataset -dataset: mllm_demo -template: gemma -cutoff_len: 1024 -max_samples: 1000 -overwrite_cache: true -preprocessing_num_workers: 16 - -### output -output_dir: saves/paligemma/lora/sft_mllm -logging_steps: 10 -save_steps: 500 -plot_loss: true -overwrite_output_dir: true - -### train -per_device_train_batch_size: 1 -gradient_accumulation_steps: 8 -learning_rate: 0.0001 -num_train_epochs: 3.0 -lr_scheduler_type: cosine -warmup_steps: 0.1 -fp16: true - -### eval -val_size: 0.1 -per_device_eval_batch_size: 1 -evaluation_strategy: steps -eval_steps: 500 From f665342a2752ffb5d715f134603d84e5228f55dc Mon Sep 17 00:00:00 2001 From: BUAADreamer <1428195643@qq.com> Date: Mon, 27 May 2024 20:10:31 +0800 Subject: [PATCH 4/4] remove mllm_pt_demo.json --- data/dataset_info.json | 2 +- data/mllm_pt_demo.json | 92 ------------------------------------------ 2 files changed, 1 insertion(+), 93 deletions(-) delete mode 100644 data/mllm_pt_demo.json diff --git a/data/dataset_info.json b/data/dataset_info.json index 5a90e077..1deb3d6d 100644 --- a/data/dataset_info.json +++ b/data/dataset_info.json @@ -39,7 +39,7 @@ } }, "mllm_pt_demo": { - "file_name": "mllm_pt_demo.json", + "hf_hub_url": "BUAADreamer/mllm_pt_demo", "formatting": "sharegpt", "columns": { "messages": "messages", diff --git a/data/mllm_pt_demo.json b/data/mllm_pt_demo.json deleted file mode 100644 index 2ee01ce6..00000000 --- a/data/mllm_pt_demo.json +++ /dev/null @@ -1,92 +0,0 @@ -[ - { - "messages": [ - { - "content": "Render a clear and concise summary of the photo.", - "role": "user" - }, - { - "content": "There are two soccer players on the field.", - "role": "assistant" - } - ], - "images": [ - "mllm_demo_data/1.jpg" - ] - }, - { - "messages": [ - { - "content": "Write a terse but informative summary of the picture.", - "role": "user" - }, - { - "content": "A soccer player is sliding on his knees to celebrate", - "role": "assistant" - } - ], - "images": [ - "mllm_demo_data/2.jpg" - ] - }, - { - "messages": [ - { - "content": "What is this?", - "role": "user" - }, - { - "content": "A man is giving a speech.", - "role": "assistant" - } - ], - "images": [ - "mllm_demo_data/3.jpg" - ] - }, - { - "messages": [ - { - "content": "对照片进行简明扼要的概括。", - "role": "user" - }, - { - "content": "两个足球运动员在场上", - "role": "assistant" - } - ], - "images": [ - "mllm_demo_data/1.jpg" - ] - }, - { - "messages": [ - { - "content": "为图片写一个简短但内容丰富的摘要。", - "role": "user" - }, - { - "content": "一个足球运动员在跪地滑行庆祝", - "role": "assistant" - } - ], - "images": [ - "mllm_demo_data/2.jpg" - ] - }, - { - "messages": [ - { - "content": "这是什么?", - "role": "user" - }, - { - "content": "一个男人在演讲", - "role": "assistant" - } - ], - "images": [ - "mllm_demo_data/3.jpg" - ] - } -] \ No newline at end of file