from typing import Optional, Union, List
from opendelta.utils.signature import get_arg_names_inside_func
from opendelta.utils.name_based_addressing import *
from opendelta.basemodel import DeltaBase, is_leaf_module
from opendelta.utils.cuda import get_device, get_dtype
import torch.nn as nn
import torch
from torch.nn import init
import math
from opendelta import BaseDeltaConfig
import opendelta.utils.logging as logging

logger = logging.get_logger(__name__)

class BitFitConfig(BaseDeltaConfig):
    r"""
    This is the configuration class to store the configuration of a :py:class:`~BitFitModel`.
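
    A minimal usage sketch, assuming ``AutoDeltaModel`` and a loaded ``backbone_model`` are available
    (the ``modified_modules`` value shown is illustrative, not a recommendation):

    .. code-block:: python

        bitfit_config = BitFitConfig(modified_modules=["attn", "ff"])
        delta_model = AutoDeltaModel.from_config(bitfit_config, backbone_model=backbone_model)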
"""
    def __init__(
        self,
        **kwargs
    ):
        super().__init__(**kwargs)
        arg_names = get_arg_names_inside_func(self.__init__)
        for arg_name in arg_names:
            if not hasattr(self, arg_name):  # the arg has not been registered in the parent config
                setattr(self, arg_name, locals()[arg_name])

class BiasLayer(nn.Module):
    r"""A delta module that adds a lazily-instantiated (zero-initialized) bias vector to the output of the module it is attached to."""
    def __init__(self, init_method="zero", dtype=None, device=None, backend=None):
        super().__init__()
        self.init_method = init_method
        self.instantiated = False
        self.dtype = dtype
        self.device = device
        self.backend = backend

    def instantiate(self, hidden_dim):
        if self.init_method == "zero":
            self.bias = nn.Parameter(torch.zeros(hidden_dim, dtype=self.dtype, device=self.device))
        else:
            raise NotImplementedError
        self.instantiated = True
        if self.backend == 'bmt':
            import bmtrain as bmt
            self.bias = bmt.BMTrainModelWrapper(self.bias)

    def post_forward(self, output):
        r"""Presuming the first element of the output is the tensor to add the bias to, along the last dimension.
        In most cases this is correct. However, be aware of the possibility that the presumption
        doesn't hold.
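
        For example (a minimal sketch; the shapes are illustrative), a wrapped submodule returning a tuple
        ``(hidden_states, attention_weights)`` only has the bias added to ``hidden_states``:

        .. code-block:: python

            bias_layer = BiasLayer()
            hidden_states = torch.randn(2, 5, 768)
            new_output = bias_layer.post_forward((hidden_states, None))
            # new_output[0] equals hidden_states + bias_layer.bias (broadcast over the last dim)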
        """
        if isinstance(output, tuple):
            hiddens = output[0]
        elif isinstance(output, torch.Tensor):
            hiddens = output
        else:
            raise TypeError

        if not self.instantiated:
            self.hidden_dim = hiddens.shape[-1]
            logger.debug(f"Got hidden dim: {self.hidden_dim}")
            self.instantiate(hidden_dim=self.hidden_dim)
        modified_output = hiddens + self.bias

        if isinstance(output, tuple):
            output = (modified_output,) + output[1:]
        elif isinstance(output, torch.Tensor):
            output = modified_output
        else:
            raise TypeError
        return output

class BitFitModel(DeltaBase):
    r"""The implementation of `BitFit: Simple Parameter-efficient Fine-tuning for Transformer-based Masked Language-models <https://arxiv.org/abs/2106.10199>`_ .
    Unfreezes the bias terms of the modules in a transformer block (or adds bias terms if they are absent
    in the backbone, e.g. T5).

    .. note::

        **Broadcast to Submodule**: We modify all potential positions of the specified
        ``modified_modules``. That is to say, if we specify ``attn`` in the modified_modules, then every position
        inside the attention layer, including the q, k, v and out linear layers, gets a bias layer added (or its
        existing bias unfrozen). The potential positions are determined according to equations (1)-(5) and the
        previous three equations.

    class attributes:

        - default_modified_modules = ["attn", "ff", "layer_norm", "lm_head.proj"] According to the paper and the
          implementation in `Compacter's baseline <https://github.com/rabeehk/compacter>`_, we modify the
          bias terms in the above modules.
        - delta_type = "bitfit"

    Args:
        backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified.
        modified_modules (:obj:`List[str]`): The modules in which the bias terms are unfrozen (or added when absent).
        unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen
            together with the bias parameters.
        common_structure (:obj:`bool`): whether to use name-based addressing with a common structure mapping.
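
    A minimal usage sketch (the backbone and the ``exclude`` list below are illustrative; any supported
    transformer backbone works similarly):

    .. code-block:: python

        from transformers import AutoModelForSequenceClassification
        backbone_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
        delta_model = BitFitModel(backbone_model)
        delta_model.freeze_module(exclude=["deltas", "classifier"])
        delta_model.log()  # inspect which parameters remain trainable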
"""
    config_class = BitFitConfig
    delta_type = "bitfit"
    default_modified_modules = ["attn@", "ff@", "layer_norm@", "lm_head@.proj@"]  # modify all the bias parameters in the attention and feed-forward layers.
    _supported_backends = ['hf', 'bmt']
    _need_pseudo_data = False

    def __init__(self,
                 backbone_model: nn.Module,
                 modified_modules: Optional[List[str]] = None,
                 exclude_modules: Optional[List[str]] = None,
                 unfrozen_modules: Optional[List[str]] = None,
                 common_structure: Optional[bool] = None,
                 interactive_modify: Optional[Union[bool, int]] = False,
                 backend: Optional[str] = "hf",
                 ):
        DeltaBase.__init__(self,
                           backbone_model,
                           modified_modules=modified_modules,
                           exclude_modules=exclude_modules,
                           unfrozen_modules=unfrozen_modules,
                           common_structure=common_structure,
                           interactive_modify=interactive_modify,
                           backend=backend,
                           )
        arg_names = get_arg_names_inside_func(self.__init__)
        for arg_name in arg_names:
            if not hasattr(self, arg_name):  # not registered in parent class
                setattr(self, arg_name, locals()[arg_name])

        self.delta_params = nn.ParameterList()
        self.delta_modules = nn.ModuleList()

        self.add_all_delta_to_backbone(self.backbone_model,
                                       self.modified_modules)

    def update_module(self, module: nn.Module, key: str):
        _, _, ref = self.find_module(module, key)
        self.modify_module(ref)

    def modify_module(self,
                      module: nn.Module,
                      ):
        if is_leaf_module(module):
            if self.backend_mapping.check_type(module, 'linear') or \
               self.backend_mapping.check_type(module, 'layer_norm'):
                self.add_bias_to_modules_have_bias_or_known_type(module)
            else:
                self.add_bias_to_others(module)
        else:
            for n, c in module.named_modules():
                self.add_bias_to_modules_have_bias_or_known_type(c)

    def add_bias_to_modules_have_bias_or_known_type(self, c):
        '''If the module has a bias, unfreeze it.
        If it doesn't have a bias: if it is a Linear or LayerNorm, add a bias to it; otherwise pass.
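
        For instance (an illustrative sketch; ``delta`` is assumed to be a ``BitFitModel`` built with the
        default ``hf`` backend):

        .. code-block:: python

            linear = nn.Linear(768, 768, bias=False)   # no bias parameter yet
            delta.add_bias_to_modules_have_bias_or_known_type(linear)
            assert linear.bias is not None              # a trainable bias has been registered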
'''
        if 'bias' in [n for n, p in c.named_parameters()]:
            c.bias.requires_grad = True
            self.delta_params.append(c.bias)
        else:
            if self.backend_mapping.check_type(c, 'linear') or \
               self.backend_mapping.check_type(c, 'layer_norm'):
                bias = nn.Parameter(torch.empty(c.out_features), requires_grad=True)

                self._reset_bias_parameters(c, bias)
                if self.backend == 'bmt':
                    import bmtrain as bmt
                    bias = bmt.BMTrainModelWrapper(bias)

                c.register_parameter('bias', bias)
                self.delta_params.append(bias)

    def add_bias_to_others(self, c):
        new_bias = BiasLayer(dtype=get_dtype(c), device=get_device(c), backend=self.backend)
        # The delta name shouldn't be `bias` here, since the name `bias` is reserved in some modules,
        # such as RoBERTa's LayerNorm.
        self.insert_sequential_module(c, delta_module=new_bias, delta_name="bitfit")
        self.delta_modules.append(new_bias)

    @staticmethod
    def _reset_bias_parameters(linear_module, bias):
        # Follow nn.Linear's default bias initialization: uniform in [-bound, bound] with bound = 1/sqrt(fan_in).
        fan_in, _ = init._calculate_fan_in_and_fan_out(linear_module.weight)
        bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
        init.uniform_(bias, -bound, bound)

    def detach(self, module):
        r"""Not implemented for BitFit yet. Please wait for the next version.
        """
        raise NotImplementedError

    def attach(self, module):
        r"""Not implemented for BitFit yet. Please wait for the next version.
        """
        raise NotImplementedError