fix #3694

parent 44cfa9a1cd
commit 2a67ab3925
@@ -5,8 +5,8 @@ model_name_or_path: models/llama3-8b-instruct-pro
 stage: sft
 do_train: true
 finetuning_type: freeze
-name_module_trainable: all
-num_layer_trainable: 8
+freeze_trainable_layers: 8
+freeze_trainable_modules: all
 use_llama_pro: true
 
 # dataset
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Performs block expansion for LLaMA, Mistral or Qwen1.5 models.
+# Performs block expansion for LLaMA, Mistral, Qwen1.5 or Yi models.
 # Usage: python llama_pro.py --model_name_or_path meta-llama/Llama-2-7b-hf --output_dir llama2_pro --num_expand 8
 # Inspired by: https://github.com/TencentARC/LLaMA-Pro/blob/main/scripts/block_expansion.py
 
@@ -106,8 +106,7 @@ def block_expansion(
     print("Fine-tune this model with:")
     print(" --model_name_or_path {} \\".format(output_dir))
     print(" --finetuning_type freeze \\")
-    print(" --name_module_trainable all \\")
-    print(" --num_layer_trainable {} \\".format(num_expand))
+    print(" --freeze_trainable_layers {} \\".format(num_expand))
     print(" --use_llama_pro")
 
 
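For orientation, a hedged numeric sketch (not part of the script) of why the printed flags reuse `num_expand` directly: block expansion appends an identity-initialized copy after every `num_layers // num_expand`-th group of blocks, so the new blocks sit at a fixed stride and are exactly the layers the freeze tuner later marks trainable under `--use_llama_pro`. The 32-layer / 8-block figures below are assumptions for illustration.

# Hedged sketch, not from llama_pro.py: locate the expanded (trainable) blocks
# for an assumed 32-layer model expanded by 8 identity blocks.
num_original_layers, num_expand = 32, 8        # assumed example values
num_layers = num_original_layers + num_expand  # 40 layers after expansion
stride = num_layers // num_expand              # 5
expanded_layer_ids = list(range(stride - 1, num_layers + stride - 1, stride))
print(expanded_layer_ids)  # [4, 9, 14, 19, 24, 29, 34, 39]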
@@ -1,5 +1,4 @@
-import json
-from dataclasses import asdict, dataclass, field
+from dataclasses import dataclass, field
 from typing import Literal, Optional
 
 
@@ -9,22 +8,40 @@ class FreezeArguments:
     Arguments pertaining to the freeze (partial-parameter) training.
     """
 
-    name_module_trainable: str = field(
-        default="all",
-        metadata={
-            "help": """Name of trainable modules for partial-parameter (freeze) fine-tuning. \
-                  Use commas to separate multiple modules. \
-                  Use "all" to specify all the available modules. \
-                  LLaMA choices: ["mlp", "self_attn"], \
-                  BLOOM & Falcon & ChatGLM choices: ["mlp", "self_attention"], \
-                  Qwen choices: ["mlp", "attn"], \
-                  InternLM2 choices: ["feed_forward", "attention"], \
-                  Others choices: the same as LLaMA."""
-        },
-    )
-    num_layer_trainable: int = field(
+    freeze_trainable_layers: int = field(
         default=2,
-        metadata={"help": "The number of trainable layers for partial-parameter (freeze) fine-tuning."},
+        metadata={
+            "help": (
+                "The number of trainable layers for freeze (partial-parameter) fine-tuning. "
+                "Positive numbers mean the last n layers are set as trainable, "
+                "negative numbers mean the first n layers are set as trainable."
+            )
+        },
+    )
+    freeze_trainable_modules: str = field(
+        default="all",
+        metadata={
+            "help": (
+                "Name(s) of trainable modules for freeze (partial-parameter) fine-tuning. "
+                "Use commas to separate multiple modules. "
+                "Use `all` to specify all the available modules. "
+                "LLaMA choices: [`mlp`, `self_attn`], "
+                "BLOOM & Falcon & ChatGLM choices: [`mlp`, `self_attention`], "
+                "Qwen choices: [`mlp`, `attn`], "
+                "InternLM2 choices: [`feed_forward`, `attention`], "
+                "Others choices: the same as LLaMA."
+            )
+        },
+    )
+    freeze_extra_modules: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "Name(s) of modules apart from hidden layers to be set as trainable "
+                "for freeze (partial-parameter) fine-tuning. "
+                "Use commas to separate multiple modules."
+            )
+        },
     )
 
 
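As a quick sanity check on the renamed fields, a minimal sketch using a trimmed-down stand-in dataclass (`FreezeArgs` below is hypothetical, not the package's class) parsed with `transformers.HfArgumentParser`:

from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class FreezeArgs:  # hypothetical stand-in for FreezeArguments, trimmed for illustration
    freeze_trainable_layers: int = field(default=2)
    freeze_trainable_modules: str = field(default="all")
    freeze_extra_modules: Optional[str] = field(default=None)


parser = HfArgumentParser(FreezeArgs)
(args,) = parser.parse_args_into_dataclasses(
    args=["--freeze_trainable_layers", "8", "--freeze_trainable_modules", "mlp,self_attn"]
)
print(args.freeze_trainable_layers, args.freeze_trainable_modules)  # 8 mlp,self_attn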
@@ -37,7 +54,11 @@ class LoraArguments:
     additional_target: Optional[str] = field(
         default=None,
         metadata={
-            "help": "Name(s) of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint."
+            "help": (
+                "Name(s) of modules apart from LoRA layers to be set as trainable "
+                "and saved in the final checkpoint. "
+                "Use commas to separate multiple modules."
+            )
         },
     )
     lora_alpha: Optional[int] = field(
@@ -55,15 +76,17 @@ class LoraArguments:
     lora_target: str = field(
         default="all",
         metadata={
-            "help": """Name(s) of target modules to apply LoRA. \
-                  Use commas to separate multiple modules. \
-                  Use "all" to specify all the linear modules. \
-                  LLaMA choices: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], \
-                  BLOOM & Falcon & ChatGLM choices: ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"], \
-                  Baichuan choices: ["W_pack", "o_proj", "gate_proj", "up_proj", "down_proj"], \
-                  Qwen choices: ["c_attn", "attn.c_proj", "w1", "w2", "mlp.c_proj"], \
-                  InternLM2 choices: ["wqkv", "wo", "w1", "w2", "w3"], \
-                  Others choices: the same as LLaMA."""
+            "help": (
+                "Name(s) of target modules to apply LoRA. "
+                "Use commas to separate multiple modules. "
+                "Use `all` to specify all the linear modules. "
+                "LLaMA choices: [`q_proj`, `k_proj`, `v_proj`, `o_proj`, `gate_proj`, `up_proj`, `down_proj`], "
+                "BLOOM & Falcon & ChatGLM choices: [`query_key_value`, `dense`, `dense_h_to_4h`, `dense_4h_to_h`], "
+                "Baichuan choices: [`W_pack`, `o_proj`, `gate_proj`, `up_proj`, `down_proj`], "
+                "Qwen choices: [`c_attn`, `attn.c_proj`, `w1`, `w2`, `mlp.c_proj`], "
+                "InternLM2 choices: [`wqkv`, `wo`, `w1`, `w2`, `w3`], "
+                "Others choices: the same as LLaMA."
+            )
         },
     )
     loraplus_lr_ratio: Optional[float] = field(
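The help string above only documents per-architecture module names; as a hedged illustration (not from this commit), a comma-separated `lora_target` typically ends up as the `target_modules` list of peft's `LoraConfig`:

from peft import LoraConfig

lora_target = "q_proj,v_proj"  # assumed user input; valid names depend on the architecture as listed above
target_modules = [name.strip() for name in lora_target.split(",")]
config = LoraConfig(r=8, lora_alpha=16, target_modules=target_modules, task_type="CAUSAL_LM")
print(config.target_modules)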
@@ -177,8 +200,10 @@ class GaloreArguments:
     galore_target: str = field(
         default="all",
         metadata={
-            "help": """Name(s) of modules to apply GaLore. Use commas to separate multiple modules. \
-                  Use "all" to specify all the linear modules."""
+            "help": (
+                "Name(s) of modules to apply GaLore. Use commas to separate multiple modules. "
+                "Use `all` to specify all the linear modules."
+            )
         },
     )
     galore_rank: int = field(
@@ -238,16 +263,20 @@ class BAdamArgument:
     badam_mask_mode: Literal["adjacent", "scatter"] = field(
         default="adjacent",
         metadata={
-            "help": """The mode of the mask for BAdam optimizer. \
-                  `adjacent` means that the trainable parameters are adjacent to each other, \
-                  `scatter` means that trainable parameters are randomly choosed from the weight."""
+            "help": (
+                "The mode of the mask for BAdam optimizer. "
+                "`adjacent` means that the trainable parameters are adjacent to each other, "
+                "`scatter` means that trainable parameters are randomly choosed from the weight."
+            )
         },
     )
     badam_verbose: int = field(
         default=0,
         metadata={
-            "help": """The verbosity level of BAdam optimizer. \
-                  0 for no print, 1 for print the block prefix, 2 for print trainable parameters"""
+            "help": (
+                "The verbosity level of BAdam optimizer. "
+                "0 for no print, 1 for print the block prefix, 2 for print trainable parameters."
+            )
         },
     )
 
@@ -285,7 +314,8 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA
                 return [item.strip() for item in arg.split(",")]
             return arg
 
-        self.name_module_trainable = split_arg(self.name_module_trainable)
+        self.freeze_trainable_modules = split_arg(self.freeze_trainable_modules)
+        self.freeze_extra_modules = split_arg(self.freeze_extra_modules)
         self.lora_alpha = self.lora_alpha or self.lora_rank * 2
         self.lora_target = split_arg(self.lora_target)
         self.additional_target = split_arg(self.additional_target)
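A minimal sketch (not from the commit) of what the `split_arg` helper above does to the renamed freeze options during post-init:

def split_arg(arg):  # same idea as the helper shown in the hunk above
    if isinstance(arg, str):
        return [item.strip() for item in arg.split(",")]
    return arg


print(split_arg("mlp, self_attn"))  # ['mlp', 'self_attn']
print(split_arg("all"))             # ['all']
print(split_arg(None))              # None -> an unset freeze_extra_modules passes through unchanged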
@@ -315,17 +345,3 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA
 
         if self.loraplus_lr_ratio is not None and self.finetuning_type != "lora":
             raise ValueError("`loraplus_lr_ratio` is only valid for the LoRA training.")
-
-    def save_to_json(self, json_path: str):
-        r"""Saves the content of this instance in JSON format inside `json_path`."""
-        json_string = json.dumps(asdict(self), indent=2, sort_keys=True) + "\n"
-        with open(json_path, "w", encoding="utf-8") as f:
-            f.write(json_string)
-
-    @classmethod
-    def load_from_json(cls, json_path: str):
-        r"""Creates an instance from the content of `json_path`."""
-        with open(json_path, "r", encoding="utf-8") as f:
-            text = f.read()
-
-        return cls(**json.loads(text))
@@ -1,3 +1,4 @@
+import re
 from typing import TYPE_CHECKING
 
 import torch
@@ -68,37 +69,52 @@ def init_adapter(
             raise ValueError("Current model does not support freeze tuning.")
 
         if finetuning_args.use_llama_pro:
-            if num_layers % finetuning_args.num_layer_trainable != 0:
+            if num_layers % finetuning_args.freeze_trainable_layers != 0:
                 raise ValueError(
                     "`num_layers` {} should be divisible by `num_layer_trainable` {}.".format(
-                        num_layers, finetuning_args.num_layer_trainable
+                        num_layers, finetuning_args.freeze_trainable_layers
                     )
                 )
 
-            stride = num_layers // finetuning_args.num_layer_trainable
+            stride = num_layers // finetuning_args.freeze_trainable_layers
             trainable_layer_ids = range(stride - 1, num_layers + stride - 1, stride)
-        elif finetuning_args.num_layer_trainable > 0:  # fine-tuning the last n layers if num_layer_trainable > 0
-            trainable_layer_ids = range(num_layers - finetuning_args.num_layer_trainable, num_layers)
+        elif finetuning_args.freeze_trainable_layers > 0:  # fine-tuning the last n layers if num_layer_trainable > 0
+            trainable_layer_ids = range(max(0, num_layers - finetuning_args.freeze_trainable_layers), num_layers)
         else:  # fine-tuning the first n layers if num_layer_trainable < 0
-            trainable_layer_ids = range(-finetuning_args.num_layer_trainable)
+            trainable_layer_ids = range(min(-finetuning_args.freeze_trainable_layers, num_layers))
 
-        freeze_modules = {"all"}
-        for name, _ in model.named_modules():
+        hidden_modules = set()
+        non_hidden_modules = set()
+        for name, _ in model.named_parameters():
             if ".0." in name:
-                freeze_modules.add(name.split(".0.")[-1].split(".")[0])
+                hidden_modules.add(name.split(".0.")[-1].split(".")[0])
             elif ".1." in name:  # MoD starts from layer 1
-                freeze_modules.add(name.split(".1.")[-1].split(".")[0])
+                hidden_modules.add(name.split(".1.")[-1].split(".")[0])
+
+            if re.search(r"\.\d+\.", name) is None:
+                non_hidden_modules.add(name.split(".")[-2])
 
         trainable_layers = []
-        for module_name in finetuning_args.name_module_trainable:
-            if module_name not in freeze_modules:
+        for module_name in finetuning_args.freeze_trainable_modules:
+            if module_name != "all" and module_name not in hidden_modules:
                 raise ValueError(
-                    "Module {} is not found, please choose from {}".format(module_name, ", ".join(freeze_modules))
+                    "Module {} is not found, please choose from {}".format(module_name, ", ".join(hidden_modules))
                 )
 
             for idx in trainable_layer_ids:
                 trainable_layers.append(".{:d}.{}".format(idx, module_name if module_name != "all" else ""))
 
+        if finetuning_args.freeze_extra_modules:
+            for module_name in finetuning_args.freeze_extra_modules:
+                if module_name not in non_hidden_modules:
+                    raise ValueError(
+                        "Module {} is not found, please choose from {}".format(
+                            module_name, ", ".join(non_hidden_modules)
+                        )
+                    )
+
+                trainable_layers.append(module_name)
+
         for name, param in model.named_parameters():
             if any(trainable_layer in name for trainable_layer in trainable_layers):
                 if cast_trainable_params_to_fp32:
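To summarize the selection rules the hunk above implements, a standalone sketch (not the library function) of how `freeze_trainable_layers` maps to hidden-layer indices:

def select_trainable_layer_ids(num_layers: int, freeze_trainable_layers: int, use_llama_pro: bool = False):
    # Mirrors the rules shown above: positive -> last n layers, negative -> first n layers,
    # use_llama_pro -> one layer per stride (the expanded blocks).
    if use_llama_pro:
        if num_layers % freeze_trainable_layers != 0:
            raise ValueError("num_layers should be divisible by freeze_trainable_layers")
        stride = num_layers // freeze_trainable_layers
        return list(range(stride - 1, num_layers + stride - 1, stride))
    if freeze_trainable_layers > 0:
        return list(range(max(0, num_layers - freeze_trainable_layers), num_layers))
    return list(range(min(-freeze_trainable_layers, num_layers)))


print(select_trainable_layer_ids(32, 2))                      # [30, 31]
print(select_trainable_layer_ids(32, -2))                     # [0, 1]
print(select_trainable_layer_ids(40, 8, use_llama_pro=True))  # [4, 9, 14, 19, 24, 29, 34, 39]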
@@ -124,13 +124,17 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
 
         with gr.Accordion(open=False) as freeze_tab:
             with gr.Row():
-                num_layer_trainable = gr.Slider(minimum=1, maximum=128, value=2, step=1)
-                name_module_trainable = gr.Textbox(value="all")
+                freeze_trainable_layers = gr.Slider(minimum=-128, maximum=128, value=2, step=1)
+                freeze_trainable_modules = gr.Textbox(value="all")
+                freeze_extra_modules = gr.Textbox()
 
-        input_elems.update({num_layer_trainable, name_module_trainable})
+        input_elems.update({freeze_trainable_layers, freeze_trainable_modules, freeze_extra_modules})
         elem_dict.update(
             dict(
-                freeze_tab=freeze_tab, num_layer_trainable=num_layer_trainable, name_module_trainable=name_module_trainable
+                freeze_tab=freeze_tab,
+                freeze_trainable_layers=freeze_trainable_layers,
+                freeze_trainable_modules=freeze_trainable_modules,
+                freeze_extra_modules=freeze_extra_modules,
             )
         )
 
@@ -572,24 +572,24 @@ LOCALES = {
             "label": "部分参数微调设置",
         },
     },
-    "num_layer_trainable": {
+    "freeze_trainable_layers": {
         "en": {
             "label": "Trainable layers",
-            "info": "The number of trainable layers.",
+            "info": "Number of the last(+)/first(-) hidden layers to be set as trainable.",
         },
         "ru": {
             "label": "Обучаемые слои",
-            "info": "Количество обучаемых слоев.",
+            "info": "Количество последних (+)/первых (-) скрытых слоев, которые будут установлены как обучаемые.",
         },
         "zh": {
             "label": "可训练层数",
-            "info": "可训练模型层的数量。",
+            "info": "最末尾(+)/最前端(-)可训练隐藏层的数量。",
         },
     },
-    "name_module_trainable": {
+    "freeze_trainable_modules": {
         "en": {
             "label": "Trainable modules",
-            "info": "The name of trainable modules. Use commas to separate multiple modules.",
+            "info": "Name(s) of trainable modules. Use commas to separate multiple modules.",
         },
         "ru": {
             "label": "Обучаемые модули",
@@ -600,6 +600,26 @@ LOCALES = {
             "info": "可训练模块的名称。使用英文逗号分隔多个名称。",
         },
     },
+    "freeze_extra_modules": {
+        "en": {
+            "label": "Extra modules (optional)",
+            "info": (
+                "Name(s) of modules apart from hidden layers to be set as trainable. "
+                "Use commas to separate multiple modules."
+            ),
+        },
+        "ru": {
+            "label": "Дополнительные модули (опционально)",
+            "info": (
+                "Имена модулей, кроме скрытых слоев, которые следует установить в качестве обучаемых. "
+                "Используйте запятые для разделения нескольких модулей."
+            ),
+        },
+        "zh": {
+            "label": "额外模块(非必填)",
+            "info": "除隐藏层以外的可训练模块名称。使用英文逗号分隔多个名称。",
+        },
+    },
     "lora_tab": {
         "en": {
             "label": "LoRA configurations",
@@ -146,8 +146,9 @@ class Runner:
         )
 
         if args["finetuning_type"] == "freeze":
-            args["num_layer_trainable"] = get("train.num_layer_trainable")
-            args["name_module_trainable"] = get("train.name_module_trainable")
+            args["freeze_trainable_layers"] = get("train.freeze_trainable_layers")
+            args["freeze_trainable_modules"] = get("train.freeze_trainable_modules")
+            args["freeze_extra_modules"] = get("train.freeze_extra_modules") or None
         elif args["finetuning_type"] == "lora":
             args["lora_rank"] = get("train.lora_rank")
             args["lora_alpha"] = get("train.lora_alpha")