merge parallel-adapter succeeded
parent 26e45110b2
commit 3867c0d8dc
@@ -7,3 +7,8 @@
 2. **Available Models with default configurations are ..., Please manually add the delta models by specifying 'modified_modules' based on the visualization of your model structure**
 
 Although most pre-trained models (PTMs) use the Transformer architecture, they are implemented differently. For example, the attention module in GPT2 and BERT is not only named differently but also implemented differently. Common structure mapping maps the different naming conventions of different PTMs into a unified naming convention, but there are many PTMs that we do not currently cover. Don't worry! For these models, you can figure out which modules you should modify by simply [visualizing the PTMs](visualization), and then specify the `modified_modules` manually (see [name-based addressing](namebasedaddr)).
+
+3. **Requires a dummy_inputs to be passed through the model to understand the dimensionality of each tensor in the computation graph. The {module.__class__.__name__} Class has no dummy_inputs, and automatically created dummy_inputs failed.**
+
+The `dummy_inputs` can be any data that make `backbone_model.forward(**dummy_inputs)` succeed. Only the form and shape of the `dummy_inputs` matter. To set dummy_inputs for your model, please use `setattr(backbone_model, 'dummy_inputs', some_dummy_inputs)` before initializing `{self.__class__.__name__}`.
+
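A minimal sketch of the fix the new FAQ entry describes, assuming a Hugging Face `transformers` backbone; the tensor shapes and values below are placeholders, since only the form and shape of the `dummy_inputs` matter:

```python
import torch
from transformers import AutoModelForSequenceClassification

backbone_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

# Any mapping that makes `backbone_model.forward(**dummy_inputs)` succeed will do;
# the values are irrelevant, only the form and shape of the tensors matter.
some_dummy_inputs = {
    "input_ids": torch.zeros(1, 16, dtype=torch.long),
    "attention_mask": torch.ones(1, 16, dtype=torch.long),
}
setattr(backbone_model, "dummy_inputs", some_dummy_inputs)
# ...then initialize the delta model on top of backbone_model as usual.
```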
@@ -372,7 +372,7 @@ class DeltaBase(nn.Module, SaveLoadMixin):
             except:
                 _auto_dummy_fail = True
             if _auto_dummy_fail:
-                raise AttributeError(f"\nThe {self.__class__.__name__} requires a pseudo-data to be passed through the model to understand the dimensionality of each tensor in the computation graph. \nThe automatically created dummy inputs failed.\nThe `dummy_inputs` can be any data that make `backbone_model.forward(**dummy_inputs)` succeed. Only the form and shape of the `dummy_inputs` matter.\n\tTo set dummy_inputs for your model, please use: `setattr(backbone_model, 'dummy_inputs', some_dummy_inputs)` before initializing `{self.__class__.__name__}` ")
+                raise AttributeError(f"\n\tThe {self.__class__.__name__} requires a dummy_inputs to be passed through the model to understand the dimensionality of each tensor in the computation graph. \n\t The {module.__class__.__name__} Class has no dummy_inputs, and automatically created dummy_inputs failed.\n\t Refer to `https://opendelta.readthedocs.io/en/latest/notes/faq.html` for detail.")
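For orientation, a minimal self-contained sketch of the probe pattern the context lines imply: run one forward pass with dummy inputs, set a failure flag, and raise the error above on failure. `auto_create_dummy_inputs` is a hypothetical stand-in, not OpenDelta's actual helper:

```python
import torch

def auto_create_dummy_inputs(model):
    # Hypothetical stand-in: guess token-style inputs; real logic would
    # inspect the forward signature of `model` instead.
    return {"input_ids": torch.zeros(1, 8, dtype=torch.long)}

def probe_with_dummy_inputs(backbone_model):
    if hasattr(backbone_model, "dummy_inputs"):
        dummy_inputs = backbone_model.dummy_inputs
    else:
        dummy_inputs = auto_create_dummy_inputs(backbone_model)  # hypothetical helper
    _auto_dummy_fail = False
    try:
        backbone_model.forward(**dummy_inputs)
    except Exception:
        _auto_dummy_fail = True
    if _auto_dummy_fail:
        raise AttributeError(
            "Automatically created dummy_inputs failed; set "
            "`backbone_model.dummy_inputs` manually before initializing the delta model."
        )
```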
@@ -804,13 +804,7 @@ class DeltaBase(nn.Module, SaveLoadMixin):
         if _delta_info['method'] == "replace":
             setattr(submodule, _delta_info["child_name"], _delta_info['org_module'])
-        elif _delta_info['method'] == "insert_sequential":
-            if hasattr(submodule.forward, "__wrapped__"):
-                submodule.forward = submodule.forward.__wrapped__
-                delattr(submodule, _delta_info["delta_name"])
-            else:
-                raise AttributeError("submodule {}'s forward has no attribute __wrapped__. It's not a wrapped function.".format(name))
-        elif _delta_info['method'] == "insert_parallel":
+        elif _delta_info['method'] in ["sequential", "before", "after", "parallel"]:
             if hasattr(submodule.forward, "__wrapped__"):
                 submodule.forward = submodule.forward.__wrapped__
                 delattr(submodule, _delta_info["delta_name"])
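The detach logic above relies on the convention that an attached delta replaces a submodule's `forward` with a wrapper that keeps the original reachable via `__wrapped__` (the attribute `functools.wraps` sets). A minimal sketch of that wrap/unwrap round trip on a plain `nn.Linear`:

```python
import functools
import torch
import torch.nn as nn

linear = nn.Linear(4, 4)
original_forward = linear.forward  # the bound method to restore later

@functools.wraps(original_forward)  # sets wrapped_forward.__wrapped__ = original_forward
def wrapped_forward(*args, **kwargs):
    hidden = original_forward(*args, **kwargs)
    # ...a delta module would add its own computation here...
    return hidden

linear.forward = wrapped_forward                 # attach: shadow the bound method
assert hasattr(linear.forward, "__wrapped__")
linear.forward = linear.forward.__wrapped__      # detach: restore the original forward
x = torch.ones(1, 4)
assert torch.equal(linear.forward(x), original_forward(x))
```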
@@ -5,12 +5,9 @@ from opendelta.utils.signature import get_arg_names_inside_func
 from opendelta.utils.name_based_addressing import *
 from opendelta.utils.cuda import get_device
 from opendelta.basemodel import DeltaBase
-import loralib as lora
 import torch.nn as nn
 import torch
-import math
 from opendelta.delta_models.layers.activations import Activations
-import inspect
 from opendelta import BaseDeltaConfig
 import opendelta.utils.logging as logging
 logger = logging.get_logger(__name__)
@@ -147,13 +144,16 @@ class ParallelAdapterModel(DeltaBase):
     """
     config_class = ParallelAdapterConfig
-    delta_type = "adapter"
-    default_modified_modules = ["attn", "attn", "ff.w1", "ff.w2"]
+    delta_type = "parallel_adapter"
+    default_modified_modules = ["attn@", "attn@", "ff@.w1@", "ff@.w2@"]
+    # default_modified_modules = ["attn", "attn", "ff.w1", "ff.w2"]
+    _need_pseudo_data = True
     def __init__(self,
                  backbone_model: nn.Module,
                  bottleneck_dim: Optional[int]=24,
                  non_linearity: Optional[str]='gelu_new',
                  modified_modules: Optional[bool] = None,
+                 exclude_modules: Optional[List[str]] = None,
                  unfrozen_modules: Optional[bool] = None,
                  common_structure: Optional[bool] = None,
                  interactive_modify: Optional[Union[bool, int]] = False,
@@ -161,6 +161,7 @@ class ParallelAdapterModel(DeltaBase):
         DeltaBase.__init__(self,
                            backbone_model,
                            modified_modules=modified_modules,
+                           exclude_modules=exclude_modules,
                            unfrozen_modules=unfrozen_modules,
                            common_structure=common_structure,
                            interactive_modify=interactive_modify,
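Taken together, the last two hunks thread a new `exclude_modules` argument through `ParallelAdapterModel` into `DeltaBase.__init__`. A hedged usage sketch, assuming `ParallelAdapterModel` is exported at the package top level like the other delta models; the excluded module name is illustrative only:

```python
from transformers import AutoModelForSequenceClassification
from opendelta import ParallelAdapterModel  # assumed top-level export

backbone_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

delta_model = ParallelAdapterModel(
    backbone_model=backbone_model,
    bottleneck_dim=24,
    non_linearity="gelu_new",
    exclude_modules=["classifier"],  # illustrative: leave the task head untouched
)
# Freeze everything except the newly inserted delta parameters.
delta_model.freeze_module(exclude=["deltas"], set_state_dict=True)
```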