diff --git a/examples/extras/llama_pro/llama3_freeze_sft.yaml b/examples/extras/llama_pro/llama3_freeze_sft.yaml
index 4d92cdad..0ffcb5e8 100644
--- a/examples/extras/llama_pro/llama3_freeze_sft.yaml
+++ b/examples/extras/llama_pro/llama3_freeze_sft.yaml
@@ -5,8 +5,8 @@ model_name_or_path: models/llama3-8b-instruct-pro
 stage: sft
 do_train: true
 finetuning_type: freeze
-name_module_trainable: all
-num_layer_trainable: 8
+freeze_trainable_layers: 8
+freeze_trainable_modules: all
 use_llama_pro: true

 # dataset
diff --git a/scripts/llama_pro.py b/scripts/llama_pro.py
index 8a4294a2..997b3496 100644
--- a/scripts/llama_pro.py
+++ b/scripts/llama_pro.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Performs block expansion for LLaMA, Mistral or Qwen1.5 models.
+# Performs block expansion for LLaMA, Mistral, Qwen1.5 or Yi models.
 # Usage: python llama_pro.py --model_name_or_path meta-llama/Llama-2-7b-hf --output_dir llama2_pro --num_expand 8
 # Inspired by: https://github.com/TencentARC/LLaMA-Pro/blob/main/scripts/block_expansion.py

@@ -106,8 +106,7 @@ def block_expansion(
     print("Fine-tune this model with:")
     print("    --model_name_or_path {} \\".format(output_dir))
     print("    --finetuning_type freeze \\")
-    print("    --name_module_trainable all \\")
-    print("    --num_layer_trainable {} \\".format(num_expand))
+    print("    --freeze_trainable_layers {} \\".format(num_expand))
     print("    --use_llama_pro")


diff --git a/src/llmtuner/hparams/finetuning_args.py b/src/llmtuner/hparams/finetuning_args.py
index 03bf52af..e728c30a 100644
--- a/src/llmtuner/hparams/finetuning_args.py
+++ b/src/llmtuner/hparams/finetuning_args.py
@@ -1,5 +1,4 @@
-import json
-from dataclasses import asdict, dataclass, field
+from dataclasses import dataclass, field
 from typing import Literal, Optional


@@ -9,22 +8,40 @@ class FreezeArguments:
     Arguments pertaining to the freeze (partial-parameter) training.
     """

-    name_module_trainable: str = field(
-        default="all",
+    freeze_trainable_layers: int = field(
+        default=2,
         metadata={
-            "help": """Name of trainable modules for partial-parameter (freeze) fine-tuning. \
-                    Use commas to separate multiple modules. \
-                    Use "all" to specify all the available modules. \
-                    LLaMA choices: ["mlp", "self_attn"], \
-                    BLOOM & Falcon & ChatGLM choices: ["mlp", "self_attention"], \
-                    Qwen choices: ["mlp", "attn"], \
-                    InternLM2 choices: ["feed_forward", "attention"], \
-                    Others choices: the same as LLaMA."""
+            "help": (
+                "The number of trainable layers for freeze (partial-parameter) fine-tuning. "
+                "Positive numbers mean the last n layers are set as trainable, "
+                "negative numbers mean the first n layers are set as trainable."
+            )
         },
     )
-    num_layer_trainable: int = field(
-        default=2,
-        metadata={"help": "The number of trainable layers for partial-parameter (freeze) fine-tuning."},
+    freeze_trainable_modules: str = field(
+        default="all",
+        metadata={
+            "help": (
+                "Name(s) of trainable modules for freeze (partial-parameter) fine-tuning. "
+                "Use commas to separate multiple modules. "
+                "Use `all` to specify all the available modules. "
+                "LLaMA choices: [`mlp`, `self_attn`], "
+                "BLOOM & Falcon & ChatGLM choices: [`mlp`, `self_attention`], "
+                "Qwen choices: [`mlp`, `attn`], "
+                "InternLM2 choices: [`feed_forward`, `attention`], "
+                "Others choices: the same as LLaMA."
+            )
+        },
+    )
+    freeze_extra_modules: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Name(s) of modules apart from hidden layers to be set as trainable "
+                "for freeze (partial-parameter) fine-tuning. "
" + "Use commas to separate multiple modules." + ) + }, ) @@ -37,7 +54,11 @@ class LoraArguments: additional_target: Optional[str] = field( default=None, metadata={ - "help": "Name(s) of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint." + "help": ( + "Name(s) of modules apart from LoRA layers to be set as trainable " + "and saved in the final checkpoint. " + "Use commas to separate multiple modules." + ) }, ) lora_alpha: Optional[int] = field( @@ -55,15 +76,17 @@ class LoraArguments: lora_target: str = field( default="all", metadata={ - "help": """Name(s) of target modules to apply LoRA. \ - Use commas to separate multiple modules. \ - Use "all" to specify all the linear modules. \ - LLaMA choices: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], \ - BLOOM & Falcon & ChatGLM choices: ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"], \ - Baichuan choices: ["W_pack", "o_proj", "gate_proj", "up_proj", "down_proj"], \ - Qwen choices: ["c_attn", "attn.c_proj", "w1", "w2", "mlp.c_proj"], \ - InternLM2 choices: ["wqkv", "wo", "w1", "w2", "w3"], \ - Others choices: the same as LLaMA.""" + "help": ( + "Name(s) of target modules to apply LoRA. " + "Use commas to separate multiple modules. " + "Use `all` to specify all the linear modules. " + "LLaMA choices: [`q_proj`, `k_proj`, `v_proj`, `o_proj`, `gate_proj`, `up_proj`, `down_proj`], " + "BLOOM & Falcon & ChatGLM choices: [`query_key_value`, `dense`, `dense_h_to_4h`, `dense_4h_to_h`], " + "Baichuan choices: [`W_pack`, `o_proj`, `gate_proj`, `up_proj`, `down_proj`], " + "Qwen choices: [`c_attn`, `attn.c_proj`, `w1`, `w2`, `mlp.c_proj`], " + "InternLM2 choices: [`wqkv`, `wo`, `w1`, `w2`, `w3`], " + "Others choices: the same as LLaMA." + ) }, ) loraplus_lr_ratio: Optional[float] = field( @@ -177,8 +200,10 @@ class GaloreArguments: galore_target: str = field( default="all", metadata={ - "help": """Name(s) of modules to apply GaLore. Use commas to separate multiple modules. \ - Use "all" to specify all the linear modules.""" + "help": ( + "Name(s) of modules to apply GaLore. Use commas to separate multiple modules. " + "Use `all` to specify all the linear modules." + ) }, ) galore_rank: int = field( @@ -238,16 +263,20 @@ class BAdamArgument: badam_mask_mode: Literal["adjacent", "scatter"] = field( default="adjacent", metadata={ - "help": """The mode of the mask for BAdam optimizer. \ - `adjacent` means that the trainable parameters are adjacent to each other, \ - `scatter` means that trainable parameters are randomly choosed from the weight.""" + "help": ( + "The mode of the mask for BAdam optimizer. " + "`adjacent` means that the trainable parameters are adjacent to each other, " + "`scatter` means that trainable parameters are randomly choosed from the weight." + ) }, ) badam_verbose: int = field( default=0, metadata={ - "help": """The verbosity level of BAdam optimizer. \ - 0 for no print, 1 for print the block prefix, 2 for print trainable parameters""" + "help": ( + "The verbosity level of BAdam optimizer. " + "0 for no print, 1 for print the block prefix, 2 for print trainable parameters." 
+            )
         },
     )

@@ -285,7 +314,8 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA
                 return [item.strip() for item in arg.split(",")]
             return arg

-        self.name_module_trainable = split_arg(self.name_module_trainable)
+        self.freeze_trainable_modules = split_arg(self.freeze_trainable_modules)
+        self.freeze_extra_modules = split_arg(self.freeze_extra_modules)
         self.lora_alpha = self.lora_alpha or self.lora_rank * 2
         self.lora_target = split_arg(self.lora_target)
         self.additional_target = split_arg(self.additional_target)
@@ -315,17 +345,3 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA

         if self.loraplus_lr_ratio is not None and self.finetuning_type != "lora":
             raise ValueError("`loraplus_lr_ratio` is only valid for the LoRA training.")
-
-    def save_to_json(self, json_path: str):
-        r"""Saves the content of this instance in JSON format inside `json_path`."""
-        json_string = json.dumps(asdict(self), indent=2, sort_keys=True) + "\n"
-        with open(json_path, "w", encoding="utf-8") as f:
-            f.write(json_string)
-
-    @classmethod
-    def load_from_json(cls, json_path: str):
-        r"""Creates an instance from the content of `json_path`."""
-        with open(json_path, "r", encoding="utf-8") as f:
-            text = f.read()
-
-        return cls(**json.loads(text))
diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index 83f9a2d2..4ae95a62 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -1,3 +1,4 @@
+import re
 from typing import TYPE_CHECKING

 import torch
@@ -68,37 +69,52 @@ def init_adapter(
             raise ValueError("Current model does not support freeze tuning.")

         if finetuning_args.use_llama_pro:
-            if num_layers % finetuning_args.num_layer_trainable != 0:
+            if num_layers % finetuning_args.freeze_trainable_layers != 0:
                 raise ValueError(
                     "`num_layers` {} should be divisible by `num_layer_trainable` {}.".format(
-                        num_layers, finetuning_args.num_layer_trainable
+                        num_layers, finetuning_args.freeze_trainable_layers
                     )
                 )

-            stride = num_layers // finetuning_args.num_layer_trainable
+            stride = num_layers // finetuning_args.freeze_trainable_layers
             trainable_layer_ids = range(stride - 1, num_layers + stride - 1, stride)
-        elif finetuning_args.num_layer_trainable > 0:  # fine-tuning the last n layers if num_layer_trainable > 0
-            trainable_layer_ids = range(num_layers - finetuning_args.num_layer_trainable, num_layers)
+        elif finetuning_args.freeze_trainable_layers > 0:  # fine-tuning the last n layers if num_layer_trainable > 0
+            trainable_layer_ids = range(max(0, num_layers - finetuning_args.freeze_trainable_layers), num_layers)
         else:  # fine-tuning the first n layers if num_layer_trainable < 0
-            trainable_layer_ids = range(-finetuning_args.num_layer_trainable)
+            trainable_layer_ids = range(min(-finetuning_args.freeze_trainable_layers, num_layers))

-        freeze_modules = {"all"}
-        for name, _ in model.named_modules():
+        hidden_modules = set()
+        non_hidden_modules = set()
+        for name, _ in model.named_parameters():
             if ".0." in name:
-                freeze_modules.add(name.split(".0.")[-1].split(".")[0])
+                hidden_modules.add(name.split(".0.")[-1].split(".")[0])
             elif ".1." in name:  # MoD starts from layer 1
-                freeze_modules.add(name.split(".1.")[-1].split(".")[0])
+                hidden_modules.add(name.split(".1.")[-1].split(".")[0])
+
+            if re.search(r"\.\d+\.", name) is None:
+                non_hidden_modules.add(name.split(".")[-2])

         trainable_layers = []
-        for module_name in finetuning_args.name_module_trainable:
-            if module_name not in freeze_modules:
+        for module_name in finetuning_args.freeze_trainable_modules:
+            if module_name != "all" and module_name not in hidden_modules:
                 raise ValueError(
-                    "Module {} is not found, please choose from {}".format(module_name, ", ".join(freeze_modules))
+                    "Module {} is not found, please choose from {}".format(module_name, ", ".join(hidden_modules))
                 )

             for idx in trainable_layer_ids:
                 trainable_layers.append(".{:d}.{}".format(idx, module_name if module_name != "all" else ""))

+        if finetuning_args.freeze_extra_modules:
+            for module_name in finetuning_args.freeze_extra_modules:
+                if module_name not in non_hidden_modules:
+                    raise ValueError(
+                        "Module {} is not found, please choose from {}".format(
+                            module_name, ", ".join(non_hidden_modules)
+                        )
+                    )
+
+                trainable_layers.append(module_name)
+
         for name, param in model.named_parameters():
             if any(trainable_layer in name for trainable_layer in trainable_layers):
                 if cast_trainable_params_to_fp32:
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index 5cde660c..be853604 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -124,13 +124,17 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:

     with gr.Accordion(open=False) as freeze_tab:
         with gr.Row():
-            num_layer_trainable = gr.Slider(minimum=1, maximum=128, value=2, step=1)
-            name_module_trainable = gr.Textbox(value="all")
+            freeze_trainable_layers = gr.Slider(minimum=-128, maximum=128, value=2, step=1)
+            freeze_trainable_modules = gr.Textbox(value="all")
+            freeze_extra_modules = gr.Textbox()

-    input_elems.update({num_layer_trainable, name_module_trainable})
+    input_elems.update({freeze_trainable_layers, freeze_trainable_modules, freeze_extra_modules})
     elem_dict.update(
         dict(
-            freeze_tab=freeze_tab, num_layer_trainable=num_layer_trainable, name_module_trainable=name_module_trainable
+            freeze_tab=freeze_tab,
+            freeze_trainable_layers=freeze_trainable_layers,
+            freeze_trainable_modules=freeze_trainable_modules,
+            freeze_extra_modules=freeze_extra_modules,
         )
     )

diff --git a/src/llmtuner/webui/locales.py b/src/llmtuner/webui/locales.py
index 5bf925b7..7afe6ec3 100644
--- a/src/llmtuner/webui/locales.py
+++ b/src/llmtuner/webui/locales.py
@@ -572,24 +572,24 @@ LOCALES = {
             "label": "部分参数微调设置",
         },
     },
-    "num_layer_trainable": {
+    "freeze_trainable_layers": {
         "en": {
             "label": "Trainable layers",
-            "info": "The number of trainable layers.",
+            "info": "Number of the last(+)/first(-) hidden layers to be set as trainable.",
         },
         "ru": {
             "label": "Обучаемые слои",
-            "info": "Количество обучаемых слоев.",
+            "info": "Количество последних (+)/первых (-) скрытых слоев, которые будут установлены как обучаемые.",
         },
         "zh": {
             "label": "可训练层数",
-            "info": "可训练模型层的数量。",
+            "info": "最末尾(+)/最前端(-)可训练隐藏层的数量。",
         },
     },
-    "name_module_trainable": {
+    "freeze_trainable_modules": {
         "en": {
             "label": "Trainable modules",
-            "info": "The name of trainable modules. Use commas to separate multiple modules.",
+            "info": "Name(s) of trainable modules. Use commas to separate multiple modules.",
         },
         "ru": {
             "label": "Обучаемые модули",
@@ -600,6 +600,26 @@ LOCALES = {
             "info": "可训练模块的名称。使用英文逗号分隔多个名称。",
         },
     },
+    "freeze_extra_modules": {
+        "en": {
+            "label": "Extra modules (optional)",
+            "info": (
+                "Name(s) of modules apart from hidden layers to be set as trainable. "
+                "Use commas to separate multiple modules."
+            ),
+        },
+        "ru": {
+            "label": "Дополнительные модули (опционально)",
+            "info": (
+                "Имена модулей, кроме скрытых слоев, которые следует установить в качестве обучаемых. "
+                "Используйте запятые для разделения нескольких модулей."
+            ),
+        },
+        "zh": {
+            "label": "额外模块(非必填)",
+            "info": "除隐藏层以外的可训练模块名称。使用英文逗号分隔多个名称。",
+        },
+    },
     "lora_tab": {
         "en": {
             "label": "LoRA configurations",
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index 168abd86..ef911a16 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -146,8 +146,9 @@ class Runner:
         )

         if args["finetuning_type"] == "freeze":
-            args["num_layer_trainable"] = get("train.num_layer_trainable")
-            args["name_module_trainable"] = get("train.name_module_trainable")
+            args["freeze_trainable_layers"] = get("train.freeze_trainable_layers")
+            args["freeze_trainable_modules"] = get("train.freeze_trainable_modules")
+            args["freeze_extra_modules"] = get("train.freeze_extra_modules") or None
         elif args["finetuning_type"] == "lora":
             args["lora_rank"] = get("train.lora_rank")
             args["lora_alpha"] = get("train.lora_alpha")
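
For reference, a minimal standalone sketch (not part of the patch) of how the renamed freeze_trainable_layers value maps to trainable hidden-layer indices, mirroring the selection logic added to src/llmtuner/model/adapter.py above. The helper name and the example layer counts are illustrative assumptions, not code from the repository.

# Sketch only: reproduces the layer-selection semantics of the freeze (partial-parameter)
# fine-tuning code in the diff. `num_layers` is the model's hidden-layer count.
def select_trainable_layer_ids(num_layers: int, freeze_trainable_layers: int, use_llama_pro: bool = False) -> list:
    if use_llama_pro:
        # LLaMA Pro block expansion: train every `stride`-th layer (the newly expanded blocks).
        if num_layers % freeze_trainable_layers != 0:
            raise ValueError("`num_layers` should be divisible by `freeze_trainable_layers`.")
        stride = num_layers // freeze_trainable_layers
        return list(range(stride - 1, num_layers + stride - 1, stride))
    elif freeze_trainable_layers > 0:
        # Positive value: the last n hidden layers are trainable.
        return list(range(max(0, num_layers - freeze_trainable_layers), num_layers))
    else:
        # Negative value: the first n hidden layers are trainable.
        return list(range(min(-freeze_trainable_layers, num_layers)))


if __name__ == "__main__":
    print(select_trainable_layer_ids(32, 8))                      # last 8 layers -> [24, ..., 31]
    print(select_trainable_layer_ids(32, -8))                     # first 8 layers -> [0, ..., 7]
    print(select_trainable_layer_ids(40, 8, use_llama_pro=True))  # every 5th layer -> [4, 9, ..., 39]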