from opendelta.utils.signature import get_arg_names, get_arg_names_inside_func
from opendelta.utils.name_based_addressing import *
from opendelta.utils.cuda import get_device
from opendelta.basemodel import DeltaBase
from typing import *
import torch
import torch.nn as nn
from opendelta import BaseDeltaConfig
from opendelta import logging

logger = logging.get_logger(__name__)


class SoftPromptConfig(BaseDeltaConfig):
    r"""
    This is the configuration class to store the configuration of a :py:class:`SoftPromptModel`.
    A short construction sketch follows this class.
    """
    def __init__(
        self,
        soft_token_num=100,
        init_range = 0.5,
        token_init = True,
        **kwargs
    ):
        super().__init__(**kwargs)
        arg_names = get_arg_names_inside_func(self.__init__)
        for arg_name in arg_names:
            if not hasattr(self, arg_name): # the arg has not been registered in the parent config
                setattr(self, arg_name, locals()[arg_name])
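

# A short construction sketch (illustrative only, not part of the library API). It assumes
# `BaseDeltaConfig` can be constructed from its keyword defaults alone; the function name is
# purely for demonstration.
def _example_soft_prompt_config():
    config = SoftPromptConfig(soft_token_num=50, init_range=0.1, token_init=False)
    # the soft-prompt specific hyper-parameters are attached to the config instance
    # by the arg-registration loop in `SoftPromptConfig.__init__`
    return config.soft_token_num, config.init_range, config.token_init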


class SoftPromptLayer(nn.Module):
    r"""This is the implementation of `The Power of Scale for Parameter-Efficient
    Prompt Tuning <https://arxiv.org/pdf/2104.08691v1.pdf>`_ . Similar to :obj:`PrefixTuningTemplate`,
    this template does not need any textual template either. The additional tokens are directly
    prepended to the input sequence. There are two ways to initialize the new tokens:
    (1) random initialization; (2) initialization with token embeddings of the PLM (we simply take
    the first ``soft_token_num`` token embeddings, similar to the original implementation).

    Note that the same result can be achieved with :obj:`SoftManualTemplate` by placing
    ``n_token`` <soft> tokens before <text_a> in the template.
    A small usage sketch follows this class definition.
    """

    def __init__(self,
                 soft_token_num: int = 100,
                 raw_embedding: Optional[torch.Tensor] = None,
                 init_range: Optional[float] = 0.5,
                 other_expand_ids: Optional[Dict] = {"attention_mask":1, "token_type_ids":0},
                 token_init = False,
                 pad_id = 0,
                 device: Optional[str] = None,
                ):
        super().__init__()
        # bypass nn.Module.__setattr__ so that the backbone's embedding is not
        # registered as a submodule of this delta module
        self.__dict__['raw_embedding'] = raw_embedding
        self.init_range = init_range
        self.num_tokens = soft_token_num
        self.pad_id = pad_id
        self.token_init = token_init
        self.device = device
        self.other_expand_ids = other_expand_ids

        assert self.num_tokens > 0
        # infer the hidden dimension from the embedding of an arbitrary token
        self.instantiate(raw_embedding(torch.tensor([0])).shape[-1])

        # self.all_pseudo_tokens = {}

    def pre_forward(self, *args, **kwargs):
        # if attention_mask is passed as the PLM's input, modify it here
        if 'encoder_outputs' in kwargs and kwargs['encoder_outputs'] is not None:
            # During generation the encoder outputs are cached and the decoder is run
            # again; the soft prompt has already been prepended, so do nothing.
            return args, kwargs

        # pull input_ids out of either kwargs or the positional args
        if 'input_ids' in kwargs:
            input_ids = kwargs['input_ids']
            kwargs['input_ids'] = None
        elif len(args) > 0:
            input_ids = args[0]
            args = args[1:]
        else:
            input_ids = None

        if 'attention_mask' not in kwargs or kwargs['attention_mask'] is None:
            # infer the attention mask from the padding positions
            if input_ids is None:
                raise RuntimeError("no input ids found")
            kwargs['attention_mask'] = (input_ids != self.pad_id).to(torch.int64)

        if 'inputs_embeds' not in kwargs or kwargs['inputs_embeds'] is None:
            if input_ids is None:
                raise RuntimeError("neither inputs_embeds nor input_ids is specified.")
            inputs_embeds = self.raw_embedding(input_ids)
        else:
            inputs_embeds = kwargs['inputs_embeds']

        # prepend the soft prompt embeddings to every sequence in the batch
        batch_size = inputs_embeds.size(0)
        soft_embeds = self.soft_embeds.repeat(batch_size, 1, 1)
        inputs_embeds = torch.cat([soft_embeds, inputs_embeds], 1)
        kwargs['inputs_embeds'] = inputs_embeds

        # expand auxiliary inputs (e.g. attention_mask, token_type_ids) so that their
        # sequence length matches the prompted inputs_embeds
        for expand_key in self.other_expand_ids:
            if expand_key in kwargs:
                real_tokens = kwargs[expand_key]
                # if expand_key in self.all_pseudo_tokens:
                #     pseudo_tokens = self.all_pseudo_tokens[expand_key].to(real_tokens.device)
                # else:
                pseudo_tokens_value = self.other_expand_ids[expand_key]
                pseudo_tokens = torch.ones(
                    (*real_tokens.shape[:-1], inputs_embeds.shape[-2] - real_tokens.shape[-1]),
                    dtype=real_tokens.dtype,
                    device=real_tokens.device) * pseudo_tokens_value
                # self.all_pseudo_tokens[expand_key] = pseudo_tokens
                real_tokens.data = torch.cat([pseudo_tokens, real_tokens], dim=-1)

        return args, kwargs

    def instantiate(self, hidden_dim) -> None:
        """
        Generate the parameters needed for the soft-token embeddings in soft prompt tuning.
        If ``token_init`` is True, the soft tokens are initialized with the embeddings of the
        first ``num_tokens`` tokens of the PLM; otherwise they are drawn uniformly from
        ``[-init_range, init_range]``.
        """
        soft_embeds = torch.FloatTensor(self.num_tokens, hidden_dim)
        if self.token_init:
            soft_embeds.data = torch.clone(self.raw_embedding(torch.tensor([i for i in range(self.num_tokens)])))
        else:
            soft_embeds = soft_embeds.uniform_(-self.init_range, self.init_range)

        self.soft_embeds = nn.Parameter(soft_embeds, requires_grad=True).to(self.device)
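

# A toy shape-checking sketch (illustrative only, not part of the library API). It wraps a
# plain `nn.Embedding` in `SoftPromptLayer` and checks that `pre_forward` prepends
# `soft_token_num` positions to both `inputs_embeds` and `attention_mask`. The sizes below
# are arbitrary.
def _example_soft_prompt_layer_shapes():
    vocab_size, hidden_dim, num_soft = 32, 8, 4
    embedding = nn.Embedding(vocab_size, hidden_dim)
    layer = SoftPromptLayer(soft_token_num=num_soft,
                            raw_embedding=embedding,
                            token_init=True,
                            device="cpu")
    input_ids = torch.randint(1, vocab_size, (2, 5))   # batch of 2 sequences of length 5
    _, kwargs = layer.pre_forward(input_ids=input_ids)
    assert kwargs['inputs_embeds'].shape == (2, num_soft + 5, hidden_dim)
    assert kwargs['attention_mask'].shape == (2, num_soft + 5)
    return kwargs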


class SoftPromptModel(DeltaBase):
    r"""
    This is the implementation of `The Power of Scale for Parameter-Efficient
    Prompt Tuning <https://arxiv.org/pdf/2104.08691v1.pdf>`_ . Similar to :obj:`PrefixTuningTemplate`,
    this template does not need any textual template either. The additional tokens are directly
    prepended to the input sequence. There are two ways to initialize the new tokens:
    (1) random initialization; (2) initialization with token embeddings of the PLM (we simply take
    the first ``soft_token_num`` token embeddings, similar to the original implementation).

    Note that the same result can be achieved with :obj:`SoftManualTemplate` by placing
    ``n_token`` <soft> tokens before <text_a> in the template.
    A minimal usage sketch is given at the end of this module.

    Args:
        backbone_model (:obj:`transformers.PreTrainedModel`): The backbone model to be modified.
        soft_token_num (:obj:`int`, *optional*): The number of new tokens prepended to the input.
        init_range (:obj:`float`, *optional*): If the new tokens are initialized randomly, the range of the uniform distribution.
        token_init (:obj:`bool`, *optional*, defaults to :obj:`True`): Whether to initialize the new tokens with token embeddings of the PLM.
        other_expand_ids (:obj:`dict`, *optional*, defaults to ``{"attention_mask":1, "token_type_ids":0}``): The names of
            other model inputs that must expand together with the input sequence, mapped to the fill value used for the
            added positions. For example, when 100 tokens are prepended to input_ids, the attention_mask and the
            token_type_ids have to be extended accordingly.
        modified_modules (:obj:`List[str]`): Not used by soft prompt tuning; the delta module is always inserted before
            the whole backbone (``"root"``).
        unfrozen_modules (:obj:`List[str]`, *optional*, defaults to :obj:`None`): The modules that should be unfrozen
            together with the soft prompt parameters.
        common_structure (:obj:`bool`): Whether to use name-based addressing with a common structure mapping.

    """
    config_class = SoftPromptConfig
    delta_type = "soft_prompt"
    default_modified_modules = ["root"]  # not used

    def __init__(self,
                 backbone_model: nn.Module,
                 soft_token_num=100,
                 init_range = 0.5,
                 token_init=True,
                 other_expand_ids={"attention_mask":1, "token_type_ids":0},
                 modified_modules: Optional[List[str]] = None,
                 exclude_modules: Optional[List[str]] = None,
                 unfrozen_modules: Optional[List[str]] = None,
                 common_structure: Optional[bool] = None,
                 interactive_modify: Optional[Union[bool, int]] = False,
                ):
        DeltaBase.__init__(self,
                           backbone_model = backbone_model,
                           modified_modules = ["root"],
                           exclude_modules = exclude_modules,
                           unfrozen_modules = unfrozen_modules,
                           common_structure = False,
                           interactive_modify = interactive_modify,
                           )
        arg_names = get_arg_names_inside_func(self.__init__)
        for arg_name in arg_names:
            if not hasattr(self, arg_name): # not registered in parent class
                setattr(self, arg_name, locals()[arg_name])

        try:
            # keep a reference to the backbone's input embedding without registering it
            # as a submodule of the delta model
            self.__dict__['raw_embedding'] = self.backbone_model.get_input_embeddings()
        except AttributeError:
            raise AttributeError(f"'{type(self.backbone_model)}' object has no attribute 'get_input_embeddings', please pass "+
                "input embeddings into 'self.raw_embedding' in user-specific ways.")

        self.delta_modules = nn.ModuleList()
        self.add_all_delta_to_backbone(self.backbone_model,
                                       self.modified_modules,
                                       )

    def add_all_delta_to_backbone(self,
                                  module: nn.Module,
                                  modified_modules: List[str],
                                 ) -> nn.Module:
        self.update_module()
        self.mark_as_delta()
        return module

    def update_module(self):
        soft_prompt_layer = self.new_module_like(self.raw_embedding)
        # for encoder-decoder backbones the soft prompt only needs to be prepended on the encoder side
        self.insert_sequential_module(self.backbone_model.get_encoder() if self.backbone_model.config.is_encoder_decoder else self.backbone_model,
                                      delta_module=soft_prompt_layer,
                                      delta_name="soft_prompt_layer")

    def new_module_like(self, module):
        module_device = get_device(module)
        soft_prompt_layer = SoftPromptLayer(
            soft_token_num = self.soft_token_num,
            raw_embedding = self.raw_embedding,
            other_expand_ids = self.other_expand_ids,
            token_init = self.token_init,
            init_range = self.init_range,
            device = module_device,
        )
        self.delta_modules.append(soft_prompt_layer)
        return soft_prompt_layer
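

# A minimal end-to-end usage sketch (illustrative only). The Hugging Face loader
# `AutoModelForSeq2SeqLM` and the "t5-small" checkpoint are assumptions of this sketch,
# not requirements of OpenDelta; any backbone exposing `get_input_embeddings` works.
def _example_attach_soft_prompt():
    from transformers import AutoModelForSeq2SeqLM
    backbone = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
    # prepend 100 soft tokens, initialized from the first 100 token embeddings of the PLM
    delta_model = SoftPromptModel(backbone_model=backbone, soft_token_num=100, token_init=True)
    # freeze everything except the delta parameters (`freeze_module` is inherited from DeltaBase)
    delta_model.freeze_module(exclude=["deltas"])
    return backbone, delta_model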