diff --git a/.gitignore b/.gitignore index d7b8d7f..783241d 100644 --- a/.gitignore +++ b/.gitignore @@ -56,3 +56,4 @@ t.sh unittest/outputs/ +unittest/tmp/ diff --git a/README.md b/README.md index 9a4bc75..0f507f3 100644 --- a/README.md +++ b/README.md @@ -26,17 +26,17 @@ OpenDelta is a toolkit for parameter-efficient tuning methods (we dub it as *delta tuning*), by which users could flexibly assign (or add) a small amount parameters to update while keeping the most paramters frozen. By using OpenDelta, users could easily implement prefix-tuning, adapters, Lora, or any other types of delta tuning with preferred PTMs. -- Our repo is tested on Python 3.8 and PyTorch 1.9.0. Lower version may also be supported. +- Our repo is tested on Python 3.=-0 and PyTorch 1.9.0. Lower version may also be supported. - **A demo of using Opendelta to modify the PLM (E.g., BART).** ![How PLM changes using Delta-tuning](docs/source/imgs/demo.gif) ## News -- 2022.10.10 We merge new version into main. Key changes can be seen in [Update log](#updata_log) -- 2022.03.24 We notice several bugs in Soft Prompt Tuning and Prefix Tuning, mainly due to their need to customize attention ids, token_type_ids, we are fixing it! Currently, please use the other methods since they are stabler and better in performance. -- 2022.03.20 Add a [colab example](https://colab.research.google.com/drive/1uAhgAdc8Qr42UKYDlgUv0f7W1-gAFwGo?usp=sharing) to illustrate efficient training and space-saving multitask-serving. -- 2022.03.20 A new pip version released. -- 2022.02.16 Support [regular expression](https://opendelta.readthedocs.io/en/latest/notes/namebasedaddr.html#regexexpr) in named-based addressing. +- **2022.10.14** Release v0.3.0. We make the usage of default configurations of each delta tuning methods (i.e., the position they are attached) more friendly! If a custom model has our supported models as submodules inside, the default configuration is also available. Other key changes can be seen in [Update Log](file:///Users/hsd/codes/opendelta_doc/OpenDelta/docs/build/html/notes/update.html#version-0-3-0) +- **2022.03.24** We notice several bugs in Soft Prompt Tuning and Prefix Tuning, mainly due to their need to customize attention ids, token_type_ids, we are fixing it! Currently, please use the other methods since they are stabler and better in performance. +- **2022.03.20** Add a [colab example](https://colab.research.google.com/drive/1uAhgAdc8Qr42UKYDlgUv0f7W1-gAFwGo?usp=sharing) to illustrate efficient training and space-saving multitask-serving. +- **2022.03.20** A new pip version released. +- **2022.02.16** Support [regular expression](https://opendelta.readthedocs.io/en/latest/notes/namebasedaddr.html#regexexpr) in named-based addressing. ## Installation create a virtualenv (optional) @@ -74,7 +74,7 @@ python setup.py develop ``` #### Tips -- If you want to use mirror for installing the packages, please change the `index_url` in [setup.cfg](set.cfg) +- If you want to use mirror for installing the packages, please change the `index_url` in [setup.cfg](setup.cfg) - If you encounter network error using setup.py, please firstly install the dependencies via ```shell @@ -115,7 +115,6 @@ used models that OpenDelta are sure to support. 
| CTRL | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | -## Update Log -### version 0.3.0 + diff --git a/docs/source/conf.py b/docs/source/conf.py index e2f01a3..3731214 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -19,7 +19,9 @@ import datetime import sphinx_rtd_theme import doctest import opendelta -import opendelta.delta_models + + + # -- Project information ----------------------------------------------------- @@ -29,8 +31,8 @@ copyright = '{}, {}, Licenced under the Apache License, Version 2.0'.format(date # The full version, including alpha/beta/rc tags -release = '0.1.1' -version = "0.1.1" +release = '0.3.0' +version = "0.3.0" html_theme = 'sphinx_rtd_theme' html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] diff --git a/docs/source/index.md b/docs/source/index.md index b43f7da..dac3594 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,7 +1,7 @@ OpenDelta's documentation! ===================================== -OpenDelta is a **Plug-and-play** Library of the parameter-efficient fine-tuning ([delta-tuning](WhatisDelta)) technology for pre-trained models. +[OpenDelta](https://github.com/thunlp/OpenDelta/) is a **Plug-and-play** Library of the parameter-efficient fine-tuning ([delta-tuning](WhatisDelta)) technology for pre-trained models. ## Essential Advantages: @@ -35,12 +35,18 @@ OpenDelta is a **Plug-and-play** Library of the parameter-efficient fine-tuning notes/pluginunplug.md notes/acceleration.md notes/explored_config.md + +.. toctree:: + :maxdepth: 1 + :caption: Information + notes/citation.md + notes/update.md notes/faq.md .. toctree:: :maxdepth: 2 - :caption: Package Reference + :caption: Documentation modules/base modules/deltas diff --git a/docs/source/notes/citation.md b/docs/source/notes/citation.md index 4c41201..47a88ba 100644 --- a/docs/source/notes/citation.md +++ b/docs/source/notes/citation.md @@ -1,3 +1,12 @@ # Citation - We are working on a technical report. \ No newline at end of file +If you find our repo useful, please cite the following paper. + +``` +@article{ding2022delta, + title={Delta tuning: A comprehensive study of parameter efficient methods for pre-trained language models}, + author={Ding, Ning and Qin, Yujia and Yang, Guang and Wei, Fuchao and Yang, Zonghan and Su, Yusheng and Hu, Shengding and Chen, Yulin and Chan, Chi-Min and Chen, Weize and others}, + journal={arXiv preprint arXiv:2203.06904}, + year={2022} +} +``` \ No newline at end of file diff --git a/docs/source/notes/composition.md b/docs/source/notes/composition.md index 151aa37..a32db2c 100644 --- a/docs/source/notes/composition.md +++ b/docs/source/notes/composition.md @@ -1,10 +1,9 @@ -(composition)= # Composition of delta models With OpenDelta, you can perform compostion of different delta models. 
-### Add different deltas to the backbone +## Add different deltas to the backbone ``` from transformers import AutoModelForSequenceClassification @@ -18,14 +17,14 @@ delta_model.log() ```{figure} ../imgs/composition_of_delta.png --- width: 600px -name: defaultmodification +name: composition_of_delta --- ``` ```` -### Even add multiple delta to the same layer +## Even add multiple delta to the same layer ``` from transformers import AutoModelForSequenceClassification @@ -40,7 +39,7 @@ delta_model.log() ```{figure} ../imgs/multiple_to_one_layer.png --- width: 600px -name: defaultmodification +name: multiple_to_one_layer --- ``` ```` diff --git a/docs/source/notes/explored_config.md b/docs/source/notes/explored_config.md index 34bd1f4..f5d9052 100644 --- a/docs/source/notes/explored_config.md +++ b/docs/source/notes/explored_config.md @@ -1,11 +1,7 @@ (favoredconfiguration)= # Favored Configuration - We will add the commonly used configuration of delta models HERE in future. +Generally, the default configurations are already good enough. If we want squeeze the size of delta models further, you can refer to the following papers. -E.g. -- the modified_modules (position of delta), -- hyperparameter that are the most efficient -- the favored composition between delta models - -Currenlty, use the default setting, explore it by yourself, or refer to existing papers' configuration! \ No newline at end of file + - [AdapterDrop: On the Efficiency of Adapters in Transformers](https://arxiv.org/abs/2010.11918) + - [Sparse Structure Search for Parameter-Efficient Tuning(Delta Tuning)](https://arxiv.org/abs/2206.07382) \ No newline at end of file diff --git a/docs/source/notes/faq.md b/docs/source/notes/faq.md index f3cdd00..b7e4363 100644 --- a/docs/source/notes/faq.md +++ b/docs/source/notes/faq.md @@ -1,3 +1,3 @@ # FAQ -1. We haven't provide common structure mapping for this backbone model... +1. diff --git a/docs/source/notes/keyfeature.md b/docs/source/notes/keyfeature.md index b661469..dc71d82 100644 --- a/docs/source/notes/keyfeature.md +++ b/docs/source/notes/keyfeature.md @@ -38,7 +38,7 @@ We use three key functions to achieve the modifications to the backbone model ou - **parallel insertion** Adapters can also be used in a parallel fashion (see [Paper](https://arxiv.org/abs/2110.04366)). - For these methods, use [insert_parallel_module](opendelta.basemodel.DeltaBase.insert_parrellel_module) interface. + For these methods, use [insert_parallel_module](opendelta.basemodel.DeltaBase.insert_parallel_module) interface. :::{admonition} Doc-preserving Insertion diff --git a/docs/source/notes/knownissue.md b/docs/source/notes/knownissue.md deleted file mode 100644 index 139597f..0000000 --- a/docs/source/notes/knownissue.md +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/docs/source/notes/namebasedaddr.md b/docs/source/notes/namebasedaddr.md index 0e4e200..4987d6f 100644 --- a/docs/source/notes/namebasedaddr.md +++ b/docs/source/notes/namebasedaddr.md @@ -1,4 +1,4 @@ -(namebasedaddr)= + # Name-based Addressing Named based addressing is what set OpenDelta apart from other packages and provide the possibility to be used to a broader range of models (even emerging ones). @@ -52,7 +52,7 @@ In this case, string `"name_b.0.name_a"` will be the name to address the submodu Thus when applying a delta model to this toy net. 
-``` +```python from opendelta import AdapterModel AdapterModel(backbone_model=root, modified_modules=['name_b.0.name_a']) Visualization(root).structure_graph() @@ -67,7 +67,7 @@ name: toy-delta ``` ```` - +(targetmodules)= ## Target modules. For different delta methods, the operation for the modification target is different. @@ -88,7 +88,7 @@ Handcrafting the full names of submodules can be frustrating. We made some simpl 1. **End-matching** Rules. OpenDelta will take every modules that - **ends with** the provided name suffix as the modification [target module](target_module). + **ends with** the provided name suffix as the modification [target module](targetmodules). :::{admonition} Example :class: tip Taking DistilBert with an classifier on top as an example: @@ -115,7 +115,7 @@ Handcrafting the full names of submodules can be frustrating. We made some simpl :::{admonition} Regex in Json Configs :class: warning In json, you should write `"\\."` instead of `"\."` for a real dot due to json parsing rules. That is - ```json + ``` { ... "modified_moduls": ['[r][0-5]\\.attention'], @@ -138,7 +138,7 @@ Handcrafting the full names of submodules can be frustrating. We made some simpl delta_model = LoraModel(backbone_model=model, interactive_modify=True) ``` - by setting `interactive_modify`, a web server will be opened on local host, and the link will be print in the terminal. + by setting `interactive_modify`, a web server will be opened on local host, and the link will be print in the terminal, e.g., ``` http://0.0.0.0:8888/ diff --git a/docs/source/notes/pluginunplug.md b/docs/source/notes/pluginunplug.md index eeadd57..dae80c8 100644 --- a/docs/source/notes/pluginunplug.md +++ b/docs/source/notes/pluginunplug.md @@ -19,7 +19,7 @@ delta_model.log() ```{figure} ../imgs/plugunplug1.png --- width: 800px -name: defaultmodification +name: plugunplug1 --- ``` ```` @@ -33,7 +33,7 @@ delta_model.log() ```{figure} ../imgs/plugunplug2.png --- width: 800px -name: defaultmodification +name: plugunplug2 --- ``` ```` @@ -48,7 +48,7 @@ delta_model.log() ```{figure} ../imgs/plugunplug3.png --- width: 800px -name: defaultmodification +name: plugunplug3 --- ``` ```` @@ -67,7 +67,7 @@ delta_model2.log() ```{figure} ../imgs/plugunplug4.png --- width: 800px -name: defaultmodification +name: plugunplug4 --- ``` ```` @@ -81,7 +81,7 @@ delta_model.log() ```{figure} ../imgs/plugunplug5.png --- width: 800px -name: defaultmodification +name: plugunplug5 --- ``` ```` @@ -96,7 +96,7 @@ delta_model.log() ```{figure} ../imgs/plugunplug6.png --- width: 800px -name: defaultmodification +name: plugunplug6 --- ``` ```` diff --git a/docs/source/notes/saveload.md b/docs/source/notes/saveload.md index ecddd23..24a0ce1 100644 --- a/docs/source/notes/saveload.md +++ b/docs/source/notes/saveload.md @@ -1,4 +1,3 @@ -(saveload)= # Save and Share the Delta ## Space efficient saving without changing the code. @@ -95,4 +94,4 @@ If you are satisfied with your checkpoint, do not forget to share your model to ## Save & Load for Composition of Delta - Currently save & load method is not suitable for [composition of delta model](compositon). Please wait for future releases. \ No newline at end of file + Currently save & load method is not suitable for [composition](composition) of delta model. Please wait for future releases. 
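For orientation, here is a minimal sketch of the save-then-reload round trip this document describes. It only exercises calls that appear in this changeset (`save_finetuned`, `AutoDeltaModel.from_finetuned`, `freeze_module`); the `bert-base-cased` checkpoint and the `./delta_checkpoints/bert_adapter` directory are illustrative assumptions, not values prescribed by the docs.

```python
# Sketch of the space-efficient save / reload workflow (assumed names:
# "bert-base-cased" backbone, "./delta_checkpoints/bert_adapter" directory).
from transformers import AutoModelForSequenceClassification
from opendelta import AdapterModel, AutoDeltaModel

backbone = AutoModelForSequenceClassification.from_pretrained("bert-base-cased")
delta_model = AdapterModel(backbone_model=backbone)          # default modified_modules
delta_model.freeze_module(exclude=["deltas", "classifier"])  # train only delta + classifier

# ... fine-tune the backbone as usual ...

# Only the delta (and other unfrozen) parameters are written to disk.
delta_model.save_finetuned("./delta_checkpoints/bert_adapter", push_to_dc=False)

# Later: rebuild the same backbone and re-attach the saved delta checkpoint.
backbone = AutoModelForSequenceClassification.from_pretrained("bert-base-cased")
delta_model = AutoDeltaModel.from_finetuned("./delta_checkpoints/bert_adapter",
                                            backbone_model=backbone)
```

As noted above, this round trip currently assumes a single delta model; composed delta models are not yet covered by save & load.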
\ No newline at end of file diff --git a/docs/source/notes/unifyname.md b/docs/source/notes/unifyname.md index c8117ee..f77fd68 100644 --- a/docs/source/notes/unifyname.md +++ b/docs/source/notes/unifyname.md @@ -1,4 +1,4 @@ -(unifyname)= +(commonstructure)= # Common Structure Mapping @@ -41,7 +41,7 @@ Visualize bert-base using a common structure name: The submodules that are not c ```{figure} ../imgs/commonstructure_vis.png :width: 600px -:name: transformers_structure +:name: commonstructure_vis ``` (mappingexample)= diff --git a/docs/source/notes/update.md b/docs/source/notes/update.md new file mode 100644 index 0000000..47b40e6 --- /dev/null +++ b/docs/source/notes/update.md @@ -0,0 +1,21 @@ +# Update Logs and Known Issues + + +## Version 0.3.0 +### Updates: +- Add this changelog for a granular record of updates. +- The default configuration of delta models can be applied to more wrapped models. + - There is less need to configure 'modified_modules' for wrapped models like [BertForSequenceClassification](https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification) or even [OpenMatch.DRModel](https://github.com/OpenMatch/OpenMatch/blob/master/src/openmatch/modeling/dense_retrieval_model.py#L37), as long as it has a model we support default configuration inside. **Note that if you customize `modified_modules` by yourself, most pytorch models are supported.** +- LoRA and BitFit models now does not need pseudo data to instantiate the model. +- BitFit models can now support [Conv1D](https://huggingface.co/docs/transformers/v4.23.1/en/internal/modeling_utils#transformers.Conv1D) using default configuration. +- Improve type hint for AutoDeltaModel. +- Fix bugs in documentation. +- Fix small bugs when saving a model without a config attributes. +- Make the default modified modules of adapter-like methods more accurate: attach the adapter-like modules after the output of attention layer and second feed-forward layer, both before the layernorm layers. +- A simple unit test folder containing development-time tests has been added for interested users. + + +### Known Issues +- SoftPrompt is still not supported for wrapped model if the model has no attribute `get_input_embeddings`. +- Prefix Tuning is still limited to T5, GPT2, Bart, Bert, Roberta. + diff --git a/docs/source/notes/usage.md b/docs/source/notes/usage.md index 6fc7ed4..c5d0614 100644 --- a/docs/source/notes/usage.md +++ b/docs/source/notes/usage.md @@ -12,7 +12,7 @@ model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base") ## STEP 2: Add delta modules We provide two alternatives to add the delta modules. ### 2.1 Modification based on visualization -Suppose we want to make the feedforward layer of each block as our [modification target module](target_module), +Suppose we want to make the feedforward layer of each block as our [modification target module](targetmodules), We should first know what is the name of the feedforward layer in the BART model by visualization. *For more about visualization, see [Visualization](visualization).* ```python @@ -48,7 +48,7 @@ delta_model.log() # This will visualize the backbone after modification and othe ### 2.2 Use the default modification. We also provide the default modifications of each delta methods for some commonly used PTMs (e.g., BERT, RoBERTA, DistilBERT, T5, GPT2), so the users don't need to specify the submodules to modify. 
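A rough illustration of this default path is sketched below, assuming a `bert-base-uncased` sequence-classification backbone; the commented-out line shows the explicit alternative with user-specified `modified_modules`.

```python
# Sketch of relying on the default configuration: no modified_modules is
# passed, so the adapters are attached at the delta method's default
# positions inside the wrapped model ("bert-base-uncased" is illustrative).
from transformers import AutoModelForSequenceClassification
from opendelta import AdapterModel

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

delta_model = AdapterModel(backbone_model=model)  # default modification
delta_model.log()                                 # visualize where adapters were inserted

# Explicit alternative, using end-matching names of BERT submodules:
# delta_model = AdapterModel(backbone_model=model,
#                            modified_modules=["attention.output.dense"])
```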
-The default modifications is achieved by mapping a name of a submodule to it's name on a common transformer structure. *For details about the common structure mapping, see [Common Structure Mapping](unifyname)* +The default modifications is achieved by mapping a name of a submodule to it's name on a common transformer structure. *For details about the common structure mapping, see [Common Structure Mapping](commonstructure)* diff --git a/docs/source/notes/visualization.md b/docs/source/notes/visualization.md index d873d8e..ae6a805 100644 --- a/docs/source/notes/visualization.md +++ b/docs/source/notes/visualization.md @@ -1,4 +1,3 @@ -(visualization)= # Visualize the Parameters When OpenDelta makes modifications to a pretrained model (PTM), it is beneficial to know what your PTM looks like, especially the location of the parameters. diff --git a/opendelta/__init__.py b/opendelta/__init__.py index 6d38799..431cfa1 100644 --- a/opendelta/__init__.py +++ b/opendelta/__init__.py @@ -1,5 +1,5 @@ -__version__ = "0.2.4" +__version__ = "0.3.0" class GlobalSetting: def __init__(self): diff --git a/opendelta/auto_delta.py b/opendelta/auto_delta.py index 26cf570..45547a9 100644 --- a/opendelta/auto_delta.py +++ b/opendelta/auto_delta.py @@ -5,6 +5,7 @@ import torch.nn as nn from opendelta.utils.logging import get_logger import importlib from opendelta.delta_configs import BaseDeltaConfig +from opendelta.basemodel import DeltaBase logger = get_logger(__name__) @@ -80,7 +81,7 @@ LAZY_CONFIG_MAPPING = _LazyConfigMapping(DELTA_CONFIG_MAPPING) class AutoDeltaConfig: r""" This is a generic configuration class that will be instantiated as one of the configuration classes of the library - when created with the :py:meth:`~AutoConfig.from_pretrained` class method. + when created with the :meth:`~AutoDeltaConfig.from_finetuned` or :meth:`~AutoDeltaConfig.from_dict` class method. This class cannot be instantiated directly using ``__init__()`` (throws an error). """ @@ -98,8 +99,12 @@ class AutoDeltaConfig: config_dict (:obj:`dict`): The dict of configs of delta model. kwargs: Other keyword argument pass to initialize the config. - >>> config = AutoDeltaConfig.from_dict({"delta_type":"lora"}) # This will load the dault lora config. - >>> config = AutoDeltaConfig.from_dict({"delta_type":"lora", "lora_r":5}) # Will load the default lora config, with lora_r = 5 + Examples: + + .. code-block:: python + + config = AutoDeltaConfig.from_dict({"delta_type":"lora"}) # This will load the dault lora config. + config = AutoDeltaConfig.from_dict({"delta_type":"lora", "lora_r":5}) # Will load the default lora config, with lora_r = 5 """ config_dict = deepcopy(config_dict) @@ -119,54 +124,22 @@ class AutoDeltaConfig: Parameters: - finetuned_delta_path (:obj:`str` or :obj:`os.PathLike`, *optional*): - Can be either: + finetuned_delta_path (:obj:`str` or :obj:`os.PathLike`, *optional*): Can be either: + + - A string, the model id of a finetuned delta model configuration hosted inside a model repo on huggingface.co. Valid model ids can be located at the root-level, like ``Davin/lora``, or namespaced under a user or organization name, like ``DeltaHub/lora_t5-base_mrpc``. + - A path to a *directory* containing a configuration file saved using the :py:meth:`~opendelta.basemodel.DeltaBase.save_finetuned` method, e.g., ``./my_model_directory/``. + - A path or url to a saved configuration JSON *file*, e.g.,``./my_model_directory/configuration.json``. 
- - A string, the *model id* of a finetuned delta model configuration hosted inside a model repo on - huggingface.co. Valid model ids can be located at the root-level, like ``Davin/lora``, or - namespaced under a user or organization name, like ``DeltaHub/lora_t5-base_mrpc``. - - A path to a *directory* containing a configuration file saved using the - :py:meth:`DeltaBase.save_finetuned` method, - e.g., ``./my_model_directory/``. - - A path or url to a saved configuration JSON *file*, e.g., - ``./my_model_directory/configuration.json``. - The last two option are not tested but inherited from huggingface. cache_dir (:obj:`str` or :obj:`os.PathLike`, *optional*): Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used. - force_download (:obj:`bool`, *optional*, defaults to :obj:`False`): - Whether or not to force the (re-)download the model weights and configuration files and override the - cached versions if they exist. - resume_download (:obj:`bool`, *optional*, defaults to :obj:`False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. - proxies (:obj:`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - revision(:obj:`str`, *optional*, defaults to ``"main"``): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - return_unused_kwargs (:obj:`bool`, *optional*, defaults to ``False``): - If ``False``, then this function returns just the final configuration object. - If ``True``, then this functions returns a ``Tuple(config, unused_kwargs)`` where *unused_kwargs* is a - dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., the - part of ``kwargs`` which has not been used to update ``config`` and is otherwise ignored. - trust_remote_code (:obj:`bool`, *optional*, defaults to ``False``): - Whether or not to allow for custom models defined on the Hub in their own modeling files. This option - should only be set to ``True`` for repositories you trust and in which you have read the code, as it will - execute code present on the Hub on your local machine. - kwargs(additional keyword arguments, *optional*): - The values in kwargs of any keys which are configuration attributes will be used to override the loaded - values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled - by the ``return_unused_kwargs`` keyword parameter. - + Examples: .. code-block:: python from transformers import AutoConfig - delta_config = AutoDeltaConfig.from_finetuned("DeltaHub/lora_t5-base-mrpc") + delta_config = AutoDeltaConfig.from_finetuned("thunlp/FactQA_T5-large_Adapter") """ @@ -325,7 +298,7 @@ class AutoDeltaModel: ) @classmethod - def from_config(cls, config, backbone_model, **kwargs): #-> "DeltaBase": + def from_config(cls, config, backbone_model, **kwargs) -> DeltaBase: r"""Automatically instantiates a delta model based on the :obj:`config`. The delta model correspond to the delta :obj:`config` will be loaded and initialized using the arguments in :obj:`config`. 
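To make the config-driven path concrete, a minimal sketch combining `AutoDeltaConfig.from_dict` with `AutoDeltaModel.from_config` is shown below; the `t5-base` checkpoint and `lora_r=8` are illustrative choices, not library defaults.

```python
# Sketch: build a delta config from a plain dict, then let AutoDeltaModel
# attach the corresponding delta module to a backbone (assumed: "t5-base",
# lora_r=8 chosen only for illustration).
from transformers import AutoModelForSeq2SeqLM
from opendelta import AutoDeltaConfig, AutoDeltaModel

backbone = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

# "delta_type" selects the concrete config class; remaining keys override
# that class's defaults (here the LoRA rank).
delta_config = AutoDeltaConfig.from_dict({"delta_type": "lora", "lora_r": 8})

delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=backbone)
delta_model.log()  # inspect the modified backbone and trainable parameters
```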
@@ -355,35 +328,28 @@ class AutoDeltaModel: ) @classmethod - def from_finetuned(cls, finetuned_delta_path, backbone_model, *model_args, **kwargs): + def from_finetuned(cls, finetuned_delta_path, backbone_model, *model_args, **kwargs) -> DeltaBase: r""" Automatically instantiated a delta model and load the finetuned checkpoints based on the :obj:`finetuned_delta_path`, which can either be a string pointing to a local path or a url pointint to the delta hub. It will check the hash after loading the delta model to see whether the correct backbone and delta checkpoint are used. Args: - finetuned_delta_path (:obj:`str` or :obj:`os.PathLike`, *optional*): - Can be either: + finetuned_delta_path (:obj:`str` or :obj:`os.PathLike`, *optional*): Can be either: - - A string, the *model id* of a finetuned delta model configuration hosted inside a model repo on - huggingface.co. Valid model ids can be located at the root-level, like ``Davin/lora``, or - namespaced under a user or organization name, like ``DeltaHub/lora_t5-base_mrpc``. - - A path to a *directory* containing a configuration file saved using the - :py:meth:`DeltaBase.save_finetuned` method, - e.g., ``./my_model_directory/``. - - A path or url to a saved configuration JSON *file*, e.g., - ``./my_model_directory/configuration.json``. - The last two option are not tested but inherited from huggingface. + - A string, the model name of a finetuned delta model configuration hosted inside a model repo on `Delta Center `_, like ``thunlp/FactQA_T5-large_Adapter``. + - A path to a directory containing a configuration file saved using the :meth:`~opendelta.utils.saving_loading_utils.SaveLoadMixin.save_finetuned` method, e.g., ``./my_model_directory/``. + - A path or url to a saved configuration JSON *file*, e.g., ``./my_model_directory/configuration.json``.The last two option are not tested but inherited from huggingface. backbone_model (:obj:`nn.Module`): The backbone model to be modified. - model_args: Other argument for initialize the model. - kwargs: Other kwargs that will be passed into DeltaBase.from_finetuned. + model_args: Other argument for initialize the model. See :`DeltaBase.from_finetuned` for details. + kwargs: Other kwargs that will be passed into DeltaBase.from_finetuned. See `DeltaBase.from_finetuned` for details. Example: .. code-block:: python - delta_model = AutoDeltaModel.from_finetuned("DeltaHub/lora_t5-base-mrpc", backbone_model) + delta_model = AutoDeltaModel.from_finetuned("thunlp/FactQA_T5-large_Adapter", backbone_model=5) """ delta_config = kwargs.pop("delta_config", None) diff --git a/opendelta/basemodel.py b/opendelta/basemodel.py index 4226fbf..9ab643f 100644 --- a/opendelta/basemodel.py +++ b/opendelta/basemodel.py @@ -138,7 +138,7 @@ class DeltaBase(nn.Module, SaveLoadMixin): if self.common_structure and self.structure_mapping is None: raise RuntimeError("Using common structure but the structure mapping is None") - def forward(self, *args, **kwargs) -> "RuntimeError": + def forward(self, *args, **kwargs) -> RuntimeError: r""" .. warning:: @@ -198,12 +198,12 @@ class DeltaBase(nn.Module, SaveLoadMixin): # create a new key list to avoid recursion. 
backbone_key_list = [key for key, _ in backbone.named_modules()] for key in backbone_key_list: - if self.find_key(key, modified_modules): - logger.debug("find key: {}".format(key)) + print(key) + if self.find_key(key, modified_modules): + print("found!") self.update_module(backbone, key) if self._need_pseudo_data: - self._pseudo_data_to_instantiate - + self._pseudo_data_to_instantiate(backbone) # mark the paratmers that are the delta parameters for easily displaying the delta_paramters. self.mark_as_delta() @@ -214,7 +214,10 @@ class DeltaBase(nn.Module, SaveLoadMixin): self._pseudo_data_to_instantiate_module(backbone) else: for key in self.structure_mapping.matched_pairs: - _, _, submodule = self.find_module(backbone, key) + if key == "": + submodule = backbone + else: + _, _, submodule = self.find_module(backbone, key) self._pseudo_data_to_instantiate_module(submodule) def mark_as_delta(self, module: nn.Module=None,): @@ -321,14 +324,16 @@ class DeltaBase(nn.Module, SaveLoadMixin): for x in self.exclude_modules: if key.startswith(x): # start with the excluded key return False - if self.common_structure: - key = self.structure_mapping.transform(key, strict=False) + if self.structure_mapping is not None: + key, virtual_key, in_virtual_order = self.structure_mapping.transform(key, strict=False) + # currently in_virtual_order not in use, it means that if the common structure designate adding adapter to FFN, it will be add to all submodule of FFN. if not key: return False - try: + if virtual_key is None: return endswith_in(key, target_list) - except: - raise RuntimeError("find_key exception") + else: + return endswith_in(key, target_list) or endswith_in(virtual_key, target_list) + def _pseudo_data_to_instantiate_module(self, module: Optional[nn.Module]=None): r"""Some delta model requires a pseudo-data be passed through the model to understand the dimensionality of each tensor in the computation graph. @@ -648,8 +653,6 @@ class DeltaBase(nn.Module, SaveLoadMixin): state_dict.pop(n) return state_dict includes = self.trainable_parameters_names(module) # use excludes will have trouble when the model have shared weights - # print(includes, "grad:",self.backbone_model.plm.lm_head.weight.requires_grad) - # exit() if hasattr(module.state_dict, "__wrapped__"): raise RuntimeWarning("The forward function might have been wrapped by a decorator, is it intended? Do you freeze the parameters twice?") module.state_dict = decorate(module.state_dict, _caller, extras=(includes,), kwsyntax=True) # decorator.decorate helps preserving the functions metadata (signature, etc.). diff --git a/opendelta/delta_configs.py b/opendelta/delta_configs.py index b84644b..ca722e4 100644 --- a/opendelta/delta_configs.py +++ b/opendelta/delta_configs.py @@ -23,51 +23,38 @@ class BaseDeltaConfig: Class attributes (overridden by derived classes): - - **delta_type** (:obj:`str`) -- the name of the delta modules, used to create the correct :py:class:`~opendelta.AutoConfig`. + - **delta_type** (:obj:`str`) -- the name of the delta modules, used to create the correct :py:class:`~opendelta.AutoConfig`. Args: - modified_modules (:obj:`List[str]`, *optional*, defaults to :obj:``None``) + modified_modules (:obj:`List[str]`, *optional*, defaults to :obj:`None`) The list of keys to determine which modules you want to modify. OpenDelta will take every modulees that **ends with** the one of the provided keys as the modification target. When not given any value, i.e. 
``modified_modules=None``, the delta module will use the it corresponding default modification modules. Taking DistilBertModel with an classifier on top as an example: .. note:: - **Examples**: When adding delta to DistilBertModel, - - 1. set to ``["0.attention.out_lin"]`` will add delta modules to the attention output of distilbert's - ayer 0, i.e., ``distilbert.transformer.layer.0.attention.out_lin``. + **Examples**: When adding delta to `DistilBertModel `_, + + 1. set to ``["0.attention.out_lin"]`` will add delta modules to the attention output of distilbert's layer 0, i.e., ``distilbert.transformer.layer.0.attention.out_lin``. 2. set to ``["attention.out_lin"]`` will add the delta modules in every layer's ``attention.out_lin``. - unfrozen_modules (:obj:`List[str]`, *optional*, defaults to :obj:`["deltas"]` ) - exclude_modules (:obj:`str`, *optional*, default to :obj:`None`): The modules starts with these strings will - be excluded in modification. Note that currently only plain text (no regular expression) is supported. + unfrozen_modules (:obj:`List[str]`, *optional*, defaults to :obj:`["deltas"]` ): The modules that are unfrozen + during training in :meth:`~opendelta.basemodel.DeltaBase.freeze_module`, which includes the ones that are newly introduced as delta modules, and the ones that are originally a part of the model but set to trainable (:obj:`requires_grad=True`) to train together with the delta modules. Opendelta will take every modules that **ends with** the one of the provided keys and all its sub-modules and paramters as trainable. - The modules that are unfrozen - during training. Including the ones that are newly introduced as delta modules, and the ones that are - originally a part of the model but set to trainable (:obj:`requires_grad=True`) to train together with the - delta modules. OpenDelta will take every modules that **ends with** the one of the provided keys and all - its sub-modules and paramters as trainable. + exclude_modules (:obj:`str`, *optional*, default to :obj:`None`): The modules starts with these strings will be excluded in modification. Note that currently only plain text (no regular expression) is supported. .. note:: + **Examples**: When adding delta to DistilBertModel, - + 1. set this argument to ``["bias"]`` will make all bias terms tunable. - 2. set this argument to ``["attention"]`` will make all parameters in all attention modules tunable. - - 3. set this argument to ``["deltas"]`` will make all the parameters in the newly introduced delta - modules tunable. - + 3. set this argument to ``["deltas"]`` will make all the parameters in the newly introduced delta modules tunable. 4. set this argument to ``["classifier"]`` will make all parameters in the classifier tunable. + 5. set this argument to ``["3.ffn.lin2", "deltas", "classifier"]``, will make all parameters in the third layer's feed forward layer's send linear layer, the detla modules, and the classifiers modules tunable. - 5. set this argument to ``["3.ffn.lin2", "deltas", "classifier"]``, will make all parameters in - the third layer's feed forward layer's send linear layer, the detla modules, and the classifiers modules - tunable. - - common_structure (:obj:`bool`, *optional*, default to :obj:`None`): Whether using the common structure mapping of - the transformer model when designating :obj:`modified_modules` and :obj:`unfrozen_modules`. 
+ common_structure (:obj:`bool`, *optional*, default to :obj:`None`): Whether using the common structure mapping of the transformer model when designating ``modified_modules` and ``unfrozen_modules``. backbone_class (:obj:`str`, *optional*, default to :obj:`None`): The name of backbone model's class, e.g. ``RobertaForMaskedLM``. Saving this infomation let the users explicitly know on which backbone the delta model is trained. @@ -106,13 +93,13 @@ class BaseDeltaConfig: Args: finetuned_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): This can be either: - * a string, the *model id* of a finetuned delta model configuration hosted inside a model repo on + - a string, the *model id* of a finetuned delta model configuration hosted inside a model repo on deltahub.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``. - - * a path to a *directory* containing a configuration file saved using the :meth:`BaseDeltaConfig.save_finetuned` method, e.g., ``./my_model_directory/``. - - * a path or url to a saved configuration JSON *file*, e.g., ``./my_model_directory/configuration.json``. + + - a path to a *directory* containing a configuration file saved using the :meth:`BaseDeltaConfig.save_finetuned` method, e.g., ``./my_model_directory/``. + + - a path or url to a saved configuration JSON *file*, e.g., ``./my_model_directory/configuration.json``. cache_dir (:obj:`str` or :obj:`os.PathLike`, *optional*): Path to a directory in which a downloaded pretrained delta model configuration should be cached if the @@ -120,7 +107,7 @@ class BaseDeltaConfig: .. code-block:: python - delta_config = LoraConfig.from_finetuned("DeltaHub/lora_t5-base_mrpc") + delta_config = AdapterConfig.from_finetuned("thunlp/FactQA_T5-large_Adapter", backbone_model=t5) """ config_dict, kwargs = cls.get_config_dict(finetuned_delta_path, **kwargs) @@ -132,7 +119,7 @@ class BaseDeltaConfig: return cls.from_dict(config_dict, **kwargs) - def save_finetuned(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): + def save_finetuned(self, save_directory: Union[str, os.PathLike], **kwargs): """ Save a configuration object to the directory :obj:`save_directory`, so that it can be re-loaded using the :meth:`BaseDeltaConfig.from_finetuned` class method. @@ -144,22 +131,15 @@ class BaseDeltaConfig: the Hugging Face model hub after saving it. .. warning:: - 1. Will raise error if you haven't config a Huggingface Model Hub. - 2. Using ``push_to_hub=True`` will synchronize the repository you are pushing to with ``save_directory``, - which requires ``save_directory`` to be a local clone of the repo you are pushing to if it's an existing - folder. Pass along ``temp_dir=True`` to use a temporary directory instead. - kwargs: - Additional key word arguments passed along to the - `PushToHubMixin.push_to_hub `_ method. + 1. Will raise error if you haven't config a Huggingface Model Hub. + 2. Using ``push_to_hub=True`` will synchronize the repository you are pushing to with ``save_directory``, which requires ``save_directory`` to be a local clone of the repo you are pushing to if it's an existing folder. Pass along ``temp_dir=True`` to use a temporary directory instead. + + kwargs: Additional key word arguments. 
""" if os.path.isfile(save_directory): raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") - if push_to_hub: - commit_message = kwargs.pop("commit_message", None) - repo = self._create_or_get_repo(save_directory, **kwargs) - os.makedirs(save_directory, exist_ok=True) # If we save using the predefined names, we can load using `from_pretrained` output_config_file = os.path.join(save_directory, CONFIG_NAME) @@ -167,9 +147,6 @@ class BaseDeltaConfig: self.to_json_file(output_config_file, use_diff=True) logger.info(f"Configuration saved in {output_config_file}") - if push_to_hub: - url = self._push_to_hub(repo, commit_message=commit_message) - logger.info(f"Configuration pushed to the hub in this commit: {url}") @classmethod def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "BaseDeltaConfig": @@ -354,8 +331,6 @@ class BaseDeltaConfig: def to_dict(self) -> Dict[str, Any]: """ Serializes this instance to a Python dictionary. - Returns: - :obj:`dict`: Dictionary of all the attributes that make up this configuration instance. """ output = copy.deepcopy(self.__dict__) if hasattr(self.__class__, "model_type"): diff --git a/opendelta/delta_models/adapter.py b/opendelta/delta_models/adapter.py index 0bb1bbc..752c003 100644 --- a/opendelta/delta_models/adapter.py +++ b/opendelta/delta_models/adapter.py @@ -194,7 +194,8 @@ class AdapterModel(DeltaBase): """ config_class = AdapterConfig delta_type = "adapter" - default_modified_modules = ["attn@", "ff@"] + default_modified_modules = ["attn@.proj@", "ff@.w2@"] + _need_pseudo_data = True def __init__(self, backbone_model: nn.Module, bottleneck_dim: Optional[int]=24, @@ -226,16 +227,16 @@ class AdapterModel(DeltaBase): ) - def add_all_delta_to_backbone(self, - module: nn.Module, - modified_modules: List[str], - ) -> nn.Module: - for key, _ in module.named_modules(): - if self.find_key(key, modified_modules): - self.update_module(module, key) - self._pseudo_data_to_instantiate(module) - self.mark_as_delta() - return module + # def add_all_delta_to_backbone(self, + # module: nn.Module, + # modified_modules: List[str], + # ) -> nn.Module: + # for key, _ in module.named_modules(): + # if self.find_key(key, modified_modules): + # self.update_module(module, key) + # self._pseudo_data_to_instantiate(module) + # self.mark_as_delta() + # return module def update_module(self, module: nn.Module, key: str): _, _, ref = self.find_module(module, key) diff --git a/opendelta/delta_models/bitfit.py b/opendelta/delta_models/bitfit.py index 7dce87c..c66fc5e 100644 --- a/opendelta/delta_models/bitfit.py +++ b/opendelta/delta_models/bitfit.py @@ -146,35 +146,28 @@ class BitFitModel(DeltaBase): ): if is_leaf_module(module): # if it is a leaf module, add bias to it regardless of its type. - if isinstance(module, nn.Linear): - self.add_bias_to_linear(module) + if isinstance(module, nn.Linear) or isinstance(module, nn.LayerNorm): + self.add_bias_to_modules_have_bias_or_known_type(module) else: # for example, layer_norms, lm_heads. self.add_bias_to_others(module) else: - # for the non-leaf modules, by default it will add bias only to the linear submodules. 
for n, c in module.named_modules(): - if isinstance(c, nn.Linear) or isinstance(c, nn.LayerNorm): - if c.bias is None: - bias = nn.Parameter(torch.empty(c.out_features), requires_grad=True) - c.register_parameter('bias', bias) - self._reset_bias_parameters(c) - self.delta_params.append(bias) - else: - c.bias.requires_grad = True - self.delta_params.append(c.bias) - else: - pass + self.add_bias_to_modules_have_bias_or_known_type(c) - def add_bias_to_linear(self, c): - if c.bias is None: - bias = nn.Parameter(torch.empty(c.out_features), requires_grad=True) - c.register_parameter('bias', bias) - self._reset_bias_parameters(c) - self.delta_params.append(bias) - else: + def add_bias_to_modules_have_bias_or_known_type(self, c): + '''If it has bias, unfreeze it. + If it doesn't have bias: if it is Linear of LN, add to it, else pass. + ''' + if 'bias' in [n for n,p in c.named_parameters()]: c.bias.requires_grad = True self.delta_params.append(c.bias) + else: + if isinstance(c, nn.Linear) or isinstance(c, nn.LayerNorm): # add bias + bias = nn.Parameter(torch.empty(c.out_features), requires_grad=True) + c.register_parameter('bias', bias) + self._reset_bias_parameters(c) + self.delta_params.append(bias) def add_bias_to_others(self, c): new_bias = BiasLayer() diff --git a/opendelta/delta_models/compacter.py b/opendelta/delta_models/compacter.py index 8842297..bec3d66 100644 --- a/opendelta/delta_models/compacter.py +++ b/opendelta/delta_models/compacter.py @@ -210,7 +210,8 @@ class CompacterModel(DeltaBase): """ config_class = CompacterConfig delta_type = "compacter" - default_modified_modules = ["attn@", "ff@"] + default_modified_modules = ["attn@.proj@", "ff@.w2@"] + _need_pseudo_data = True def __init__(self, backbone_model, modified_modules: Optional[List[str]] = None, @@ -260,16 +261,16 @@ class CompacterModel(DeltaBase): ) - def add_all_delta_to_backbone(self, - module: nn.Module, - modified_modules: List[str], - ) -> nn.Module: - for key, _ in module.named_modules(): - if self.find_key(key, modified_modules): - self.update_module(module, key) - self._pseudo_data_to_instantiate(module) - self.mark_as_delta() - return module + # def add_all_delta_to_backbone(self, + # module: nn.Module, + # modified_modules: List[str], + # ) -> nn.Module: + # for key, _ in module.named_modules(): + # if self.find_key(key, modified_modules): + # self.update_module(module, key) + # self._pseudo_data_to_instantiate(module) + # self.mark_as_delta() + # return module def update_module(self, module: nn.Module, key: str): _, _, ref = self.find_module(module, key) diff --git a/opendelta/delta_models/lora.py b/opendelta/delta_models/lora.py index c00d2d1..5806f6c 100644 --- a/opendelta/delta_models/lora.py +++ b/opendelta/delta_models/lora.py @@ -70,16 +70,17 @@ class LoraModel(DeltaBase): Thanks for their `loralib `_. .. note:: + In our implementation, we did not use loralib.linear to replace the linear layer of the backbone model. Instead, we insert a parallel module into the backbone. - In other words, we treat :math:`(W + A^TB) X` as :math:`WX+ A^TBX`, and insert the :math:`A^TBX` as a parallel insertion module. - If you want to use the original implementation, please refer to `lora_old.py` + In other words, we treat :math:`(W + A^TB) X` as :math:`WX+ A^TBX`, and insert the :math:`A^TBX` as a parallel insertion module. 
If you want to use the original implementation, please refer to `lora_old.py` class attributes: - - default_modified_modules = ['attn.q', 'attn.v'] According to the paper, they modify q and v matrix in the - attention layer. However, other linears can also be modified, and may lead to better performance. + + - default_modified_modules = ['attn.q', 'attn.v'] According to the paper, they modify q and v matrix in the attention layer. However, other linears can also be modified, and may lead to better performance. .. note:: + modified_modules should point to linear layer. We currently don't support broadcast to all linears in a module's child modules. diff --git a/opendelta/delta_models/low_rank_adapter.py b/opendelta/delta_models/low_rank_adapter.py index 2bc40e8..f23b00c 100644 --- a/opendelta/delta_models/low_rank_adapter.py +++ b/opendelta/delta_models/low_rank_adapter.py @@ -147,7 +147,8 @@ class LowRankAdapterModel(DeltaBase): config_class = LowRankAdapterConfig delta_type = "low_rank_adapter" - default_modified_modules = ['attn@', 'ff@'] + default_modified_modules = ["attn@.proj@", "ff@.w2@"] + _need_pseudo_data = True def __init__(self, backbone_model: nn.Module, reduction_factor = 32, @@ -180,16 +181,16 @@ class LowRankAdapterModel(DeltaBase): ) - def add_all_delta_to_backbone(self, - module: nn.Module, - modified_modules: List[str], - ) -> nn.Module: - for key, _ in module.named_modules(): - if self.find_key(key, modified_modules): - self.update_module(module, key) - self._pseudo_data_to_instantiate(module) - self.mark_as_delta() - return module + # def add_all_delta_to_backbone(self, + # module: nn.Module, + # modified_modules: List[str], + # ) -> nn.Module: + # for key, _ in module.named_modules(): + # if self.find_key(key, modified_modules): + # self.update_module(module, key) + # self._pseudo_data_to_instantiate(module) + # self.mark_as_delta() + # return module def update_module(self, module: nn.Module, key: str): _, _, ref = self.find_module(module, key) diff --git a/opendelta/delta_models/prefix.py b/opendelta/delta_models/prefix.py index 78a6a78..f64df2c 100644 --- a/opendelta/delta_models/prefix.py +++ b/opendelta/delta_models/prefix.py @@ -516,6 +516,7 @@ class PrefixModel(DeltaBase): config_class = PrefixConfig delta_type = "prefix" default_modified_modules = ['attn@'] + _need_pseudo_data = True def __init__(self, backbone_model: nn.Module, prefix_token_num=6, @@ -610,7 +611,7 @@ class PrefixModel(DeltaBase): module_device = get_device(module) prefixlayer = PrefixLayerBart(prefix_token_num=self.prefix_token_num, num_heads=module.num_heads ,device=module_device) else: - raise NotImplementedError(type(module)) + raise NotImplementedError(f"We haven't implement Prefix Tuning Layer for {module.__class__.__name__}. Please refer to https://opendelta.readthedocs.io/en/latest/notes/faq.html for detail.") return prefixlayer, module diff --git a/opendelta/delta_models/soft_prompt.py b/opendelta/delta_models/soft_prompt.py index c682132..95854dd 100644 --- a/opendelta/delta_models/soft_prompt.py +++ b/opendelta/delta_models/soft_prompt.py @@ -145,24 +145,23 @@ class SoftPromptModel(DeltaBase): you set ``n_token`` tokens template before the will give the same result. Args: + backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. soft_token_num (:obj:`int`, *optional*): num of new tokens to add in the front of the input. init_range (:obj:`float`, *optional*): If initialize new tokens randomly, the random range of uniform distribution. 
- token_init (:obj:`bool`, *optional*, default to :obj:`True`): Whether to initialize the new tokens with tokens of the plm - other_expand_ids (:obj:`dict`, *optional*, default to `{"attention_mask":1, "token_type_ids":0}`) The name of - other tokens and its default value that expand along with the input sequence. For example, when - you prepend 100 tokens to the input_ids, the attention_mask should be extended, and the token_type_ids should - be extended as well. - modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only - the implemented ones) - unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen - together with the prefix parameters. + token_init (:obj:`bool`, *optional*, default to :obj:`True`): Whether to initialize the new tokens with tokens of the PLM. + other_expand_ids (:obj:`dict`, *optional*, default to ``{'attention_mask':1, 'token_type_ids':0}``): The name of other tokens and its default value that expand along with the input sequence. For example, when you prepend 100 tokens to the input_ids, the attention_mask should be extended, and the token_type_ids should be extended as well. + modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only the implemented ones). + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen together with the prefix parameters. common_structure (:obj:`bool`): whether using name-based addressing with a common structure mapping. + """ + config_class = SoftPromptConfig delta_type = "soft_prompt" default_modified_modules = ["root"] # not used + _need_pseudo_data = False def __init__(self, backbone_model: nn.Module, soft_token_num=100, @@ -211,9 +210,7 @@ class SoftPromptModel(DeltaBase): def update_module(self): soft_prompt_layer = self.new_module_like(self.raw_embedding) - self.insert_sequential_module(self.backbone_model.get_encoder() if self.backbone_model.config.is_encoder_decoder else self.backbone_model, - delta_module=soft_prompt_layer, - delta_name="soft_prompt_layer" ) + self.insert_sequential_module(self.backbone_model.get_encoder() if self.backbone_model.config.is_encoder_decoder else self.backbone_model,delta_module=soft_prompt_layer,delta_name="soft_prompt_layer" ) def new_module_like(self, module): module_device = get_device(module) diff --git a/opendelta/utils/common_structures/__init__.py b/opendelta/utils/common_structures/__init__.py new file mode 100644 index 0000000..6c98f94 --- /dev/null +++ b/opendelta/utils/common_structures/__init__.py @@ -0,0 +1,24 @@ +CoreMappings = {} + +import importlib +import os +import sys + +cur_path = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, cur_path) + +filelist = os.listdir(cur_path) + +for file in filelist: + if not file.endswith(".py"): + continue + elif file.endswith("__init__.py"): + continue + else: + filename = file[:-3] + mappings = importlib.import_module(f".utils.common_structures.{filename}", "opendelta") + CoreMappings.update(mappings.Mappings) + + + + \ No newline at end of file diff --git a/opendelta/utils/common_structures/bert.py b/opendelta/utils/common_structures/bert.py new file mode 100644 index 0000000..b58a255 --- /dev/null +++ b/opendelta/utils/common_structures/bert.py @@ -0,0 +1,28 @@ +Mappings = {} + +Mappings['BertModel'] = { + "embeddings.word_embeddings": {"__name__":"embeddings"}, + "embeddings.position_embeddings": {"__name__":""}, + 
"embeddings.token_type_embeddings": {"__name__":""}, + "embeddings.LayerNorm": {"__name__":""}, + "encoder": {"__name__":"encoder", + "layer": {"__name__":"block", + "$": {"__name__":"$", + "attention": {"__name__":"attn", + "self.query": {"__name__":"q"}, + "self.key": {"__name__":"k"}, + "self.value": {"__name__":"v"}, + "output.dense": {"__name__":"proj"}, + "output.LayerNorm": {"__name__":"layer_norm"}, + }, + "output": {"__name__":"ff", + "dense": {"__name__":"w2"}, + "LayerNorm": {"__name__":"layer_norm"} + }, + "intermediate": {"__name__":"ff", + "dense": {"__name__":"w1"}, + } + } + } + }, +} diff --git a/opendelta/utils/common_structures/debertav2.py b/opendelta/utils/common_structures/debertav2.py new file mode 100644 index 0000000..727d03b --- /dev/null +++ b/opendelta/utils/common_structures/debertav2.py @@ -0,0 +1,31 @@ + +Mappings = {} + +Mappings['DebertaV2Model'] = { + "embeddings.word_embeddings": {"__name__":"embeddings"}, + "embeddings.LayerNorm": {"__name__":""}, + "encoder": {"__name__":"encoder", + "layer": {"__name__":"block", + "$": {"__name__":"$", + "attention": {"__name__":"attn", + "self.query_proj": {"__name__":"q"}, + "self.key_proj": {"__name__":"k"}, + "self.value_proj": {"__name__":"v"}, + "output.dense": {"__name__":"proj"}, + "output.LayerNorm": {"__name__":"layer_norm"}, + }, + "output": {"__name__":"ff", + "dense": {"__name__":"w2"}, + "LayerNorm": {"__name__":"layer_norm"} + }, + "intermediate.dense": {"__name__":"ff.w1"}, + } + }, + "rel_embeddings": {"__name__": ""}, + "LayerNorm": {"__name__": ""}, + "conv": {"__name__": "", + "conv": {"__name__": ""}, + "LayerNorm": {"__name__": ""} + } + }, +} \ No newline at end of file diff --git a/opendelta/utils/common_structures/gpt2.py b/opendelta/utils/common_structures/gpt2.py new file mode 100644 index 0000000..d486187 --- /dev/null +++ b/opendelta/utils/common_structures/gpt2.py @@ -0,0 +1,22 @@ + +Mappings = {} + +Mappings['GPT2Model'] = { + "wte": {"__name__":"embeddings"}, + "wpe": {"__name__":""}, + "h": {"__name__":"decoder.block", + "$": {"__name__":"$", + "attn": {"__name__":"attn", + "c_attn": {"__name__":"q,k,v"}, + "c_proj": {"__name__":"proj"}, + }, + "ln_1": {"__name__":"attn.layer_norm"}, + "mlp":{ "__name__": "ff", + "c_fc": {"__name__":"w1"}, + "c_proj": {"__name__":"w2"} + }, + "ln_2": {"__name__":"ff.layer_norm"}, + }, + }, + "ln_f": {"__name__":"decoder.layer_norm"}, +} \ No newline at end of file diff --git a/opendelta/utils/common_structures/opt.py b/opendelta/utils/common_structures/opt.py new file mode 100644 index 0000000..c1092e7 --- /dev/null +++ b/opendelta/utils/common_structures/opt.py @@ -0,0 +1,25 @@ + + +Mappings = {} +Mappings['OPTModel'] = { + "decoder.embed_tokens": {"__name__":"embeddings"}, + "decoder.embed_positions": {"__name__":""}, + "decoder.project_out": {"__name__":""}, + "decoder.project_in": {"__name__":""}, + "decoder": {"__name__":"decoder", + "layers": {"__name__":"block", + "$": {"__name__":"$", + "self_attn": {"__name__":"attn", + "q_proj": {"__name__":"q"}, + "k_proj": {"__name__":"k"}, + "v_proj": {"__name__":"v"}, + "out_proj": {"__name__":"proj"} + }, + "self_attn_layer_norm": {"__name__":"layer_norm"}, + "fc1": {"__name__":"ff.w1", "__virtual__": "ff", "__order__": "first"}, + "fc2": {"__name__":"ff.w2","__virtual__": "ff", "__order__": "last"}, + "final_layer_norm": {"__name__":"layer_norm"}, + } + } + } +} \ No newline at end of file diff --git a/opendelta/utils/common_structures/roberta.py b/opendelta/utils/common_structures/roberta.py new file 
mode 100644 index 0000000..94ce813 --- /dev/null +++ b/opendelta/utils/common_structures/roberta.py @@ -0,0 +1,27 @@ +Mappings = {} + +Mappings['RobertaModel'] = {"embeddings.word_embeddings": {"__name__":"embeddings"}, + "embeddings.position_embeddings": {"__name__":""}, + "embeddings.token_type_embeddings": {"__name__":""}, + "embeddings.LayerNorm": {"__name__":""}, + "encoder": {"__name__":"encoder", + "layer": {"__name__":"block", + "$": {"__name__":"$", + "attention": {"__name__":"attn", + "self.query": {"__name__":"q"}, + "self.key": {"__name__":"k"}, + "self.value": {"__name__":"v"}, + "output.dense": {"__name__":"proj"}, + "output.LayerNorm": {"__name__":"layer_norm"}, + }, + "output": {"__name__":"ff", + "dense": {"__name__":"w2"}, + "LayerNorm": {"__name__":"layer_norm"} + }, + "intermediate": {"__name__":"ff", + "dense": {"__name__":"w1"}, + } + } + } + }, +} \ No newline at end of file diff --git a/opendelta/utils/common_structures/t5.py b/opendelta/utils/common_structures/t5.py new file mode 100644 index 0000000..8150fe2 --- /dev/null +++ b/opendelta/utils/common_structures/t5.py @@ -0,0 +1,71 @@ +Mappings = {} + +t5encoder = {"__name__":"encoder", + "embed_tokens": {"__name__":"embeddings"}, + "block": {"__name__":"block", + "$": {"__name__":"$", + "layer.0": {"__name__":"attn", + "SelfAttention.q": {"__name__":"q"}, + "SelfAttention.k": {"__name__":"k"}, + "SelfAttention.v": {"__name__":"v"}, + "SelfAttention.o": {"__name__":"proj"}, + "SelfAttention.relative_attention_bias": {"__name__":""}, + "layer_norm": {"__name__":"layer_norm"}, + }, + "layer.1": {"__name__":"ff", + "DenseReluDense.wi": {"__name__":"w1"}, + "layer_norm": {"__name__":"layer_norm"}, + "DenseReluDense.wo": {"__name__":"w2"}, + } + } + }, + "final_layer_norm": {"__name__":"layer_norm"}, + } + +t5decoder = {"__name__":"decoder", + "embed_tokens": {"__name__":"embeddings"}, + "block": {"__name__":"block", + "$": {"__name__":"$", + "layer.0": {"__name__":"attn", + "SelfAttention.q": {"__name__":"q"}, + "SelfAttention.k": {"__name__":"k"}, + "SelfAttention.v": {"__name__":"v"}, + "SelfAttention.o": {"__name__":"proj"}, + "SelfAttention.relative_attention_bias": {"__name__":""}, + "layer_norm": {"__name__":"layer_norm"}, + }, + "layer.1": {"__name__":"crossattn", + "EncDecAttention.q": {"__name__":"q"}, + "EncDecAttention.k": {"__name__":"k"}, + "EncDecAttention.v": {"__name__":"v"}, + "EncDecAttention.o": {"__name__":"proj"}, + "layer_norm": {"__name__":"layer_norm"}, + }, + "layer.2": {"__name__":"ff", + "DenseReluDense.wi": {"__name__":"w1"}, + "layer_norm": {"__name__":"layer_norm"}, + "DenseReluDense.wo": {"__name__":"w2"}, + } + } + }, + "final_layer_norm": {"__name__":"layer_norm"}, + } + + + +Mappings['T5Model'] = { + "shared": {"__name__":"embeddings"}, + "encoder": t5encoder, + "decoder": t5decoder, +} + +Mappings['T5ForConditionalGeneration'] = { + "shared": {"__name__":"embeddings"}, + "encoder": t5encoder, + "decoder": t5decoder, +} + +Mappings['T5EncoderModel'] = { + "shared": {"__name__":"embeddings"}, + "encoder": t5encoder, +} \ No newline at end of file diff --git a/opendelta/utils/saving_loading_utils.py b/opendelta/utils/saving_loading_utils.py index d8633b5..4b9b92e 100644 --- a/opendelta/utils/saving_loading_utils.py +++ b/opendelta/utils/saving_loading_utils.py @@ -90,19 +90,19 @@ class DeltaCenterArguments: class SaveLoadMixin: def add_configs_when_saving(self,): self.config.backbone_class = self.backbone_model.__class__.__name__ - self.config.backbone_checkpoint_name = 
os.path.split(self.backbone_model.config._name_or_path.strip("/"))[-1] + if hasattr(self.backbone_model, "config"): + self.config.backbone_checkpoint_name = os.path.split(self.backbone_model.config._name_or_path.strip("/"))[-1] self.config.backbone_hash = gen_model_hash(self.backbone_model) - def save_finetuned( self, finetuned_delta_path: Optional[Union[str, os.PathLike]] = "./delta_checkpoints/", save_config: bool = True, state_dict: Optional[dict] = None, save_function: Callable = torch.save, - push_to_dc: bool = True, + push_to_dc: bool = False, center_args: Optional[Union[DeltaCenterArguments, dict]] = dict(), center_args_pool: Optional[dict] = dict(), list_tags: Optional[List] = list(), @@ -173,10 +173,15 @@ class SaveLoadMixin: # Save the config if save_config: self.config.save_finetuned(save_directory) + - logger.info("\n"+"*"*30+f"\nYou delta models has been saved locally to:\n\t{os.path.abspath(save_directory)}" + + + + logger.info("\n"+"*"*30+f"\nYou delta models has been saved locally to:\t{os.path.abspath(save_directory)}" ) + self.compute_saving(output_model_file) state_dict_total_params = sum(p.numel() for p in state_dict.values()) other_tags={} @@ -191,6 +196,19 @@ class SaveLoadMixin: else: logger.info(f"Delay push: you can push it to the delta center later using \n\tpython -m DeltaCenter upload {os.path.abspath(save_directory)}\n" +"*"*30) + else: + logger.info("We encourage users to push their final and public models to delta center to share them with the community!") + + def compute_saving(self, output_model_file): + import os + stats = os.stat(output_model_file) + if stats.st_size > (1024**3): + unit = 'GB' + value = stats.st_size/(1024**3) + else: + unit = 'MB' + value = stats.st_size/(1024**2) + logger.info("The state dict size is {:.3f} {}".format(value, unit)) diff --git a/opendelta/utils/structure_mapping.py b/opendelta/utils/structure_mapping.py index 38fab3f..ea4b7ae 100644 --- a/opendelta/utils/structure_mapping.py +++ b/opendelta/utils/structure_mapping.py @@ -3,290 +3,20 @@ import copy import opendelta.utils.logging as logging from opendelta.utils.visualization import Visualization logger = logging.get_logger(__name__) -opt_mapping = { - "model.decoder.embed_tokens": {"__name__":"embeddings"}, - "model.decoder.embed_positions": {"__name__":""}, - "model.decoder.project_out": {"__name__":""}, - "model.decoder.project_in": {"__name__":""}, - "model.decoder": {"__name__":"decoder", - "layer": {"__name__":"block", - "$": {"__name__":"$", - "self_attn": {"__name__":"attn", - "q_proj": {"__name__":"q"}, - "k_proj": {"__name__":"k"}, - "v_proj": {"__name__":"v"}, - "out_proj": {"__name__":"proj"} - }, - "self_attn_layer_norm": {"__name__":"layer_norm"}, - "fc1": {"__name__":"ff.w1"}, - "fc2": {"__name__":"ff.w2"}, - "final_layer_norm": {"__name__":"layer_norm"}, - } - } - } -} - -t5_mapping = { - "shared": {"__name__":"embeddings"}, - "encoder": {"__name__":"encoder", - "embed_tokens": {"__name__":"embeddings"}, - "block": {"__name__":"block", - "$": {"__name__":"$", - "layer.0": {"__name__":"attn", - "SelfAttention.q": {"__name__":"q"}, - "SelfAttention.k": {"__name__":"k"}, - "SelfAttention.v": {"__name__":"v"}, - "SelfAttention.o": {"__name__":"proj"}, - "SelfAttention.relative_attention_bias": {"__name__":""}, - "layer_norm": {"__name__":"layer_norm"}, - }, - "layer.1": {"__name__":"ff", - "DenseReluDense.wi": {"__name__":"w1"}, - "layer_norm": {"__name__":"layer_norm"}, - "DenseReluDense.wo": {"__name__":"w2"}, - } - } - }, - "final_layer_norm": 
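The `SaveLoadMixin` changes above make local saving the default (`push_to_dc=False`), guard the checkpoint-name lookup behind `hasattr(self.backbone_model, "config")`, and add `compute_saving()`, which logs the size of the serialized delta state dict in MB or GB. A minimal local save-and-reload flow, assuming the public `AutoDeltaModel`/`AutoDeltaConfig` API and an arbitrary example checkpoint:

```python
from transformers import AutoModel
from opendelta import AutoDeltaConfig, AutoDeltaModel

# Attach a delta model (LoRA here, as an example) and freeze everything else.
backbone = AutoModel.from_pretrained("bert-base-cased")   # example checkpoint
delta_config = AutoDeltaConfig.from_dict({"delta_type": "lora"})
delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=backbone)
delta_model.freeze_module(exclude=["deltas"])

# Saves only the delta parameters; compute_saving() logs the checkpoint size.
# With push_to_dc=False the checkpoint can still be pushed later via
#   python -m DeltaCenter upload <save_directory>
delta_model.save_finetuned("./delta_checkpoints/example", push_to_dc=False)

# Reload the saved deltas onto a fresh copy of the same backbone.
fresh_backbone = AutoModel.from_pretrained("bert-base-cased")
reloaded = AutoDeltaModel.from_finetuned("./delta_checkpoints/example",
                                         backbone_model=fresh_backbone)
```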
{"__name__":"layer_norm"}, - }, - "decoder": {"__name__":"decoder", - "embed_tokens": {"__name__":"embeddings"}, - "block": {"__name__":"block", - "$": {"__name__":"$", - "layer.0": {"__name__":"attn", - "SelfAttention.q": {"__name__":"q"}, - "SelfAttention.k": {"__name__":"k"}, - "SelfAttention.v": {"__name__":"v"}, - "SelfAttention.o": {"__name__":"proj"}, - "SelfAttention.relative_attention_bias": {"__name__":""}, - "layer_norm": {"__name__":"layer_norm"}, - }, - "layer.1": {"__name__":"crossattn", - "EncDecAttention.q": {"__name__":"q"}, - "EncDecAttention.k": {"__name__":"k"}, - "EncDecAttention.v": {"__name__":"v"}, - "EncDecAttention.o": {"__name__":"proj"}, - "layer_norm": {"__name__":"layer_norm"}, - }, - "layer.2": {"__name__":"ff", - "DenseReluDense.wi": {"__name__":"w1"}, - "layer_norm": {"__name__":"layer_norm"}, - "DenseReluDense.wo": {"__name__":"w2"}, - } - } - }, - "final_layer_norm": {"__name__":"layer_norm"}, - } -} -roberta_mapping = { - "roberta.embeddings.word_embeddings": {"__name__":"embeddings"}, - "roberta.embeddings.position_embeddings": {"__name__":""}, - "roberta.embeddings.token_type_embeddings": {"__name__":""}, - "roberta.embeddings.LayerNorm": {"__name__":""}, - "roberta.encoder": {"__name__":"encoder", - "layer": {"__name__":"block", - "$": {"__name__":"$", - "attention": {"__name__":"attn", - "self.query": {"__name__":"q"}, - "self.key": {"__name__":"k"}, - "self.value": {"__name__":"v"}, - "output.dense": {"__name__":"proj"}, - "output.LayerNorm": {"__name__":"layer_norm"}, - }, - "output": {"__name__":"ff", - "dense": {"__name__":"w2"}, - "LayerNorm": {"__name__":"layer_norm"} - }, - "intermediate.dense": {"__name__":"ff.w1"}, - } - } - }, - "lm_head": {"__name__":"lm_head", - "dense": {"__name__":""}, - "layer_norm": {"__name__":""}, - "decoder": {"__name__":"proj"}, - }, -} +from opendelta.utils.common_structures import CoreMappings - - -bert_mapping = { - "bert.embeddings.word_embeddings": {"__name__":"embeddings"}, - "bert.embeddings.position_embeddings": {"__name__":""}, - "bert.embeddings.token_type_embeddings": {"__name__":""}, - "bert.embeddings.LayerNorm": {"__name__":""}, - "bert.encoder": {"__name__":"encoder", - "layer": {"__name__":"block", - "$": {"__name__":"$", - "attention": {"__name__":"attn", - "self.query": {"__name__":"q"}, - "self.key": {"__name__":"k"}, - "self.value": {"__name__":"v"}, - "output.dense": {"__name__":"proj"}, - "output.LayerNorm": {"__name__":"layer_norm"}, - }, - "output": {"__name__":"ff", - "dense": {"__name__":"w2"}, - "LayerNorm": {"__name__":"layer_norm"} - }, - "intermediate.dense": {"__name__":"ff.w1"}, - } - } - }, - # "cls.predictions": {"__name__": "lm_head", - # "transform.dense": {"__name__":""}, - # "transform.LayerNorm": {"__name__":""}, - # "decoder": {"__name__":"proj"}, - # } -} - -bert_model_mapping = { - "bert.embeddings.word_embeddings": {"__name__":"embeddings"}, - "bert.embeddings.position_embeddings": {"__name__":""}, - "bert.embeddings.token_type_embeddings": {"__name__":""}, - "bert.embeddings.LayerNorm": {"__name__":""}, - "bert.encoder": {"__name__":"encoder", - "layer": {"__name__":"block", - "$": {"__name__":"$", - "attention": {"__name__":"attn", - "self.query": {"__name__":"q"}, - "self.key": {"__name__":"k"}, - "self.value": {"__name__":"v"}, - "output.dense": {"__name__":"proj"}, - "output.LayerNorm": {"__name__":"layer_norm"}, - }, - "output": {"__name__":"ff", - "dense": {"__name__":"w2"}, - "LayerNorm": {"__name__":"layer_norm"} - }, - "intermediate.dense": {"__name__":"ff.w1"}, 
- } - } - }, -} - -debertav2_mapping = { - "deberta.embeddings.word_embeddings": {"__name__":"embeddings"}, - "deberta.embeddings.LayerNorm": {"__name__":""}, - "deberta.encoder": {"__name__":"encoder", - "layer": {"__name__":"block", - "$": {"__name__":"$", - "attention": {"__name__":"attn", - "self.query_proj": {"__name__":"q"}, - "self.key_proj": {"__name__":"k"}, - "self.value_proj": {"__name__":"v"}, - "output.dense": {"__name__":"proj"}, - "output.LayerNorm": {"__name__":"layer_norm"}, - }, - "output": {"__name__":"ff", - "dense": {"__name__":"w2"}, - "LayerNorm": {"__name__":"layer_norm"} - }, - "intermediate.dense": {"__name__":"ff.w1"}, - } - }, - "rel_embeddings": {"__name__": ""}, - "LayerNorm": {"__name__": ""}, - "conv": {"__name__": "", - "conv": {"__name__": ""}, - "LayerNorm": {"__name__": ""} - } - }, - "lm_predictions.lm_head": {"__name__":"lm_head", - "dense": {"__name__":""}, - "LayerNorm": {"__name__":""}, - "bias": {"__name__": ""} - }, -} - -gpt2_mapping = { - "transformer.wte": {"__name__":"embeddings"}, - "transformer.wpe": {"__name__":""}, - "transformer.h": {"__name__":"decoder.block", - "$": {"__name__":"$", - "attn": {"__name__":"attn", - "c_attn": {"__name__":"q,k,v"}, - "c_proj": {"__name__":"proj"}, - }, - "ln_1": {"__name__":"attn.layer_norm"}, - "mlp":{ "__name__": "ff", - "c_fc": {"__name__":"w1"}, - "c_proj": {"__name__":"w2"} - }, - "ln_2": {"__name__":"ff.layer_norm"}, - }, - }, - "transformer.ln_f": {"__name__":"decoder.layernorm"}, - "lm_head": {"__name__":"lm_head.proj"}, -} - -distilbert_mapping = { - "distilbert.embeddings.word_embeddings": {"__name__":"embeddings"}, - "distilbert.embeddings.position_embeddings": {"__name__":""}, - "distilbert.embeddings.token_type_embeddings": {"__name__":""}, - "distilbert.embeddings.LayerNorm": {"__name__":""}, - "distilbert.transformer": {"__name__":"encoder", - "layer": {"__name__":"block", - "$": {"__name__":"$", - "attention": {"__name__":"attn", - "q_lin": {"__name__":"q"}, - "k_lin": {"__name__":"k"}, - "v_lin": {"__name__":"v"}, - "out_lin": {"__name__":"proj"}, - }, - "ffn": {"__name__":"ff", - "lin1": {"__name__":"w1"}, - "lin2": {"__name__":"w2"}, - }, - "sa_layer_norm": {"__name__":"attn.layer_norm"}, - "output_layer_norm":{"__name__": "ff.layer_norm"} - } - } - } -} - - -MAPPINGERROR_MSG = "We haven't provide common structure mapping for this backbone model or any of its inner modules. Please manually add the delta models by speicifying 'modified_modules' based on the visualization of model structure. Refer to `https://opendelta.readthedocs.io/en/latest/notes/faq.html` for detail." - -CoreMappings = {} - -CoreMappings['BertModel'] = { - "embeddings.word_embeddings": {"__name__":"embeddings"}, - "embeddings.position_embeddings": {"__name__":""}, - "embeddings.token_type_embeddings": {"__name__":""}, - "embeddings.LayerNorm": {"__name__":""}, - "encoder": {"__name__":"encoder", - "layer": {"__name__":"block", - "$": {"__name__":"$", - "attention": {"__name__":"attn", - "self.query": {"__name__":"q"}, - "self.key": {"__name__":"k"}, - "self.value": {"__name__":"v"}, - "output.dense": {"__name__":"proj"}, - "output.LayerNorm": {"__name__":"layer_norm"}, - }, - "output": {"__name__":"ff", - "dense": {"__name__":"w2"}, - "LayerNorm": {"__name__":"layer_norm"} - }, - "intermediate": {"__name__":"ff", - "dense": {"__name__":"w1"}, - } - } - } - }, -} +MAPPINGERROR_MSG = f"Available Models with default configurations are {list(CoreMappings.keys())} . 
Please manually add the delta models by speicifying 'modified_modules' based on the visualization of your model structure. Refer to `https://opendelta.readthedocs.io/en/latest/notes/faq.html` for detail." def transform(org_key, mapping, strict=True, warning=False, verbose=False): - chain = org_key.split(".") query = "" node = mapping new_chain = [] + virtual_key, virtual_chain, in_virtual_order = None, None, None for elem in chain: query += elem if query in node: @@ -303,7 +33,13 @@ def transform(org_key, mapping, strict=True, warning=False, verbose=False): splited_new_elem = new_elem.split(".") splited_new_elem = [e+"@" for e in splited_new_elem] special_token = '.'.join(splited_new_elem) + if '__virtual__' in node: + virtual_chain = copy.deepcopy(new_chain) + virtual_chain.append(".".join([e+'@' for e in node["__virtual__"].split(".")])) + in_virtual_order = node['__order__'] new_chain.append(special_token) # special token for transformed key + + query = "" elif "$" in node: node = node["$"] @@ -322,79 +58,16 @@ def transform(org_key, mapping, strict=True, warning=False, verbose=False): new_key = ".".join(new_chain) if verbose: print(f"{org_key} => {new_key}") - return new_key + if virtual_chain is not None: + virtual_key = ".".join(virtual_chain) - - - -def mapping_for_SequenceClassification(mapping, type): - mapping = copy.deepcopy(mapping) - if type == "roberta": - mapping.pop("lm_head") - mapping['classifier'] = {"__name__":"classifier", - "dense": {"__name__": "dense"}, - "out_proj": {"__name__":"out_proj"} - } - elif type == "bert": - mapping.pop("cls.predictions") - mapping["classifier"] = {"__name__": "classifier"} - elif type == "deberta": - mapping.pop("lm_predictions.lm_head") - mapping["pooler"] = {"__name__": "classifier"} - mapping["classifier"] = {"__name__": "classifier"} - else: - raise NotImplementedError - return mapping - -def mapping_for_ConditionalGeneration(mapping, type): - mapping = copy.deepcopy(mapping) - if type == "t5": - mapping["lm_head"] = {"__name__":"lm_head.proj"} - else: - raise NotImplementedError(MAPPINGERROR_MSG.format()) - return mapping - -def mapping_for_CausalLM(mapping, type): - mapping = copy.deepcopy(mapping) - if type == "opt": - mapping["lm_head"] = {"__name__":"lm_head.proj"} - else: - raise NotImplementedError - return mapping - -class _LazyLoading(OrderedDict): - def __init__(self, mapping): - self._mapping_string = mapping - self._mapping = {} - - def __getitem__(self, key): - if key not in self._mapping_string: - raise KeyError(MAPPINGERROR_MSG) - value = self._mapping_string[key] - self._mapping[key] = eval(value) - return self._mapping[key] - - def keys(self): - return list(self._mapping_string.keys()) - - def __contains__(self, item): - - return item in self._mapping_string + return new_key, virtual_key, in_virtual_order class CommonStructureMap(object): r""" A loading structure map. 
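`transform()` now returns a triple `(new_key, virtual_key, in_virtual_order)` instead of a single string: when a mapping entry carries `__virtual__`/`__order__` (as `fc1`/`fc2` do in the OPT mapping, since OPT has no single feed-forward submodule), the extra values describe a synthesized parent (`ff`) and the entry's position in it (`"first"`/`"last"`). A hypothetical caller-side helper, sketched only to show how the triple could be consumed; it is not part of the library:

```python
def resolve_anchor(common_map, org_key):
    """Decide where a delta module should attach for a raw key (illustrative only).

    `common_map` is assumed to be an already-constructed CommonStructureMap for
    the backbone; its construction is elided here.
    """
    new_key, virtual_key, in_virtual_order = common_map.transform(org_key)
    if virtual_key is not None:
        # e.g. OPT's fc1 advertises the virtual parent "ff" with order "first",
        # so a delta that wraps the whole FFN can anchor on the virtual key.
        return virtual_key, in_virtual_order
    return new_key, None
```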
""" - # Mappings = _LazyLoading({ - # "RobertaForSequenceClassification": """mapping_for_SequenceClassification(roberta_mapping, "roberta")""", - # "RobertaForMaskedLM": "roberta_mapping", - # "BertModel": "bert_model_mapping", - # "T5ForConditionalGeneration": """mapping_for_ConditionalGeneration(t5_mapping, "t5")""", - # "DebertaV2ForSequenceClassification": """mapping_for_SequenceClassification(debertav2_mapping, "deberta")""", - # "CLIPModel":"""""", - # "OPTForCausalLM":"""mapping_for_CausalLM(opt_mapping,"opt")""" - # }) New_Mappings = CoreMappings @@ -411,20 +84,29 @@ class CommonStructureMap(object): return self.mapping def transform(self, org_key, strict=True, warning=False): + r'''Transform a key in the original model to the name convention in common structure. + ''' new_key = org_key + virtual_key, in_virtual_order = None, None + for key in self.matched_pairs: left, right = org_key[:len(key)], org_key[len(key):].strip(".") if left == key and len(right) > 0: - transformed_key = transform(right, self.matched_pairs[key], strict, warning) - print("11:", left, transformed_key) + transformed_key, virtual_key, in_virtual_order = transform(right, self.matched_pairs[key], strict, warning) if len(left) > 0: new_key = left + "." + transformed_key + else: + new_key = transformed_key break - return new_key + return new_key, virtual_key, in_virtual_order def find_sub_common_structure(self, module, prefix='',matched_pairs = []): if module.__class__.__name__ in self.New_Mappings: - mapping = self.New_Mappings[module.__class__.__name__] + if self.New_Mappings[module.__class__.__name__]: + if callable(self.New_Mappings[module.__class__.__name__]): + mapping = self.New_Mappings[module.__class__.__name__](module) + else: + mapping = self.New_Mappings[module.__class__.__name__] matched_pairs[prefix] = mapping return for name, m in module.named_children(): @@ -434,22 +116,3 @@ class CommonStructureMap(object): - - - - -if __name__ == "__main__": - from openprompt.plms import load_plm - import argparse - parser = argparse.ArgumentParser("") - parser.add_argument("--model", type=str, default='t5-lm', help="We test both t5 and t5-lm in this scripts, the corresponding tokenizerwrapper will be automatically loaded.") - parser.add_argument("--model_name_or_path", default="t5-base-lm-adapt") - parser.add_argument("--cache_base", default='/home/hushengding/plm_cache/') - parser.add_argument("--keep_non_params", action="store_true") - parser.add_argument("--expand_params", action="store_true") - args = parser.parse_args() - plm, tokenizer, model_config, WrapperClass = load_plm(args.model, args.cache_base+args.model_name_or_path) - - for name, _ in plm.named_modules(): - transform(name, t5_mapping, strict=True, warning=False) - diff --git a/setup.cfg b/setup.cfg index dc7762f..0642610 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [easy_install] -# index_url = https://pypi.org/simple +index_url = https://pypi.org/simple -index_url = https://pypi.tuna.tsinghua.edu.cn/simple \ No newline at end of file +# index_url = https://pypi.tuna.tsinghua.edu.cn/simple \ No newline at end of file diff --git a/setup.py b/setup.py index 1e76a4e..7655c1d 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ def get_requirements(): with open('README.md', 'r') as f: setuptools.setup( name = 'opendelta', - version = "0.2.4", + version = "0.3.0", description = "An open source framework for delta learning (parameter efficient learning).", long_description=open("README.md", "r", encoding="utf-8").read(), 
long_description_content_type="text/markdown", diff --git a/unittest/test_bmtrain.py b/unittest/test_bmtrain.py index e69de29..5d843e7 100644 --- a/unittest/test_bmtrain.py +++ b/unittest/test_bmtrain.py @@ -0,0 +1,200 @@ + + + + +import time +import random +import torch +import bmtrain as bmt +import numpy as np +import os +import csv + +from model_center import get_args +from model_center.model import CPM2 +from model_center.tokenizer import CPM2Tokenizer +from model_center.dataset.cpm2dataset import DATASET +from model_center.utils import print_inspect +from model_center.dataset import DistributedDataLoader + +def get_tokenizer(args): + tokenizer = CPM2Tokenizer.from_pretrained(args.model_config) + return tokenizer + +def get_model(args): + model = CPM2.from_pretrained(args.model_config) + return model + +def get_optimizer(args, model): + optimizer = bmt.optim.AdamOffloadOptimizer(model.parameters(), weight_decay=args.weight_decay) + return optimizer + +def get_learning_rate_scheduler(args, optimizer): + if args.lr_decay_iters is None: + args.lr_decay_iters = args.train_iters * args.epochs + if args.lr_decay_style == "noam": + lr_scheduler = bmt.lr_scheduler.Noam(optimizer, + start_lr = args.lr, + warmup_iter = args.warmup_iters, + end_iter = args.lr_decay_iters, + num_iter = args.start_step) + elif args.lr_decay_style == "constant": + lr_scheduler = bmt.lr_scheduler.NoDecay(optimizer, + start_lr = args.lr, + warmup_iter = args.warmup_iters, + end_iter = -1, + num_iter = args.start_step) + elif args.lr_decay_style == "linear": + lr_scheduler = bmt.lr_scheduler.Linear(optimizer, + start_lr = args.lr, + warmup_iter = args.warmup_iters, + end_iter = args.lr_decay_iters, + num_iter = args.start_step) + elif args.lr_decay_style == "exponential": + lr_scheduler = bmt.lr_scheduler.Exponential(optimizer, + start_lr = args.lr, + warmup_iter = args.warmup_iters, + end_iter = args.lr_decay_iters, + num_iter = args.start_step) + elif args.lr_decay_style == "cosine": + lr_scheduler = bmt.lr_scheduler.Cosine(optimizer, + start_lr = args.lr, + warmup_iter = args.warmup_iters, + end_iter = args.lr_decay_iters, + num_iter = args.start_step) + else: + raise ValueError(f"lr_scheduler of type {args.lr_decay_style} is not supported yet.") + + return lr_scheduler + +def setup_model_and_optimizer(args): + # get the tokenizer + tokenizer = get_tokenizer(args) + # get the model + model = get_model(args) + bmt.synchronize() + # get the optimizer and lr_scheduler + optimizer = get_optimizer(args, model) + lr_scheduler = get_learning_rate_scheduler(args, optimizer) + bmt.synchronize() + # get the memory usage + bmt.print_rank("Model mem\n", torch.cuda.memory_summary()) + bmt.synchronize() + return tokenizer, model, optimizer, lr_scheduler + +def initialize(): + # get arguments + args = get_args() + # init bmt + bmt.init_distributed(seed = args.seed) + # init save folder + if args.save != None: + os.makedirs(args.save, exist_ok=True) + return args + +def prepare_dataset(args, tokenizer, base_path, dataset_name, rank, world_size): + splits = ['train', 'dev', 'test'] + dataset = {} + for split in splits: + dataset[split] = DATASET[dataset_name](base_path, split, rank, world_size, tokenizer, args.max_encoder_length, args.max_decoder_length) + verbalizer = torch.LongTensor(DATASET[dataset_name].get_verbalizer(tokenizer)).cuda() + return dataset, verbalizer + + +def finetune(args, tokenizer, model, optimizer, lr_scheduler, dataset, verbalizer): + loss_func = bmt.loss.FusedCrossEntropy(ignore_index=-100) + + 
optim_manager = bmt.optim.OptimManager(loss_scale=args.loss_scale) + optim_manager.add_optimizer(optimizer, lr_scheduler) + + dataloader = { + "train": DistributedDataLoader(dataset['train'], batch_size=args.batch_size, shuffle=True), + "dev": DistributedDataLoader(dataset['dev'], batch_size=args.batch_size, shuffle=False), + "test": DistributedDataLoader(dataset['test'], batch_size=args.batch_size, shuffle=False), + } + + for epoch in range(5): + model.train() + for it, data in enumerate(dataloader['train']): + enc_input = data["enc_input"] + enc_length = data["enc_length"] + dec_input = data["dec_input"] + dec_length = data["dec_length"] + targets = data["targets"] + index = data["index"] + + logits = model(enc_input, enc_length, dec_input, dec_length) + logits = logits.index_select(dim=-1, index=verbalizer) + logits = logits[torch.where(index==1)] + + loss = loss_func(logits, targets) + global_loss = bmt.sum_loss(loss).item() + + optim_manager.zero_grad() + + optim_manager.backward(loss) + grad_norm = optim_manager.clip_grad_norm(optimizer.param_groups, args.clip_grad, norm_type = 2) + + optim_manager.step() + + bmt.print_rank( + "train | epoch {:3d} | Iter: {:6d}/{:6d} | loss: {:.4f} | lr: {:.4e}, scale: {:10.4f} | grad_norm: {:.4f} |".format( + epoch, + it, + len(dataloader["train"]), + global_loss, + lr_scheduler.current_lr, + int(optim_manager.loss_scale), + grad_norm, + ) + ) + # if it % args.inspect_iters == 0: print_inspect(model, "*") + # if args.save != None and it % args.save_iters == 0: + # bmt.save(model, os.path.join(args.save, args.save_name+("-%d.pt" % it))) + + model.eval() + with torch.no_grad(): + acc = 0 + total = 0 + for it, data in enumerate(dataloader['dev']): + enc_input = data["enc_input"] + enc_length = data["enc_length"] + dec_input = data["dec_input"] + dec_length = data["dec_length"] + targets = data["targets"] + index = data["index"] + + logits = model(enc_input, enc_length, dec_input, dec_length) + logits = logits.index_select(dim=-1, index=verbalizer) + logits = logits[torch.where(index==1)] + logits = logits.argmax(dim=-1) + + acc += torch.sum(logits == targets).item() + total += logits.shape[0] + bmt.print_rank( + "dev | epoch {:3d} | Iter: {:6d}/{:6d} | acc: {:6d} | total: {:6d} |".format( + epoch, + it, + len(dataloader["dev"]), + acc, + total, + ) + ) + acc = torch.tensor(acc / total).cuda() + acc = bmt.sum_loss(acc).cpu().item() + bmt.print_rank(f"dev epoch {epoch}: accuracy: {acc}") + +def main(): + args = initialize() + tokenizer, model, optimizer, lr_scheduler = setup_model_and_optimizer(args) + dataset, verbalizer = prepare_dataset( + args, + tokenizer, + f"{args.base_path}/down_data/paraphrase", + args.dataset_name, + bmt.rank(), bmt.world_size(), + ) + finetune(args, tokenizer, model, optimizer, lr_scheduler, dataset, verbalizer) + +if __name__ == "__main__": + main() diff --git a/unittest/user_defined.py b/unittest/user_defined.py index d97c4fa..cd49e8b 100644 --- a/unittest/user_defined.py +++ b/unittest/user_defined.py @@ -1,29 +1,53 @@ # Adapted from Tevatron (https://github.com/texttron/tevatron) +from argparse import ArgumentParser import logging import os import sys - -from openmatch.arguments import DataArguments -from openmatch.arguments import DRTrainingArguments as TrainingArguments -from openmatch.arguments import ModelArguments -from openmatch.dataset import QPCollator, DRTrainDataset, DREvalDataset -from openmatch.modeling import DRModel -from openmatch.trainer import DRTrainer as Trainer -from openmatch.trainer import 
GCDenseTrainer -from transformers import AutoConfig, AutoTokenizer, HfArgumentParser, set_seed -from transformers.integrations import TensorBoardCallback +import torch.nn as nn logger = logging.getLogger(__name__) class UnitTest: - def __init__(self, model): - self.model = model + def __init__(self, models): + self.models = models + self.Configs = {} + self.Configs[0] = { + "delta_type": "lora", + } - def unitTest1(self, delta_config_dict): - model = self.model + self.Configs[1] = { + "delta_type": "bitfit", + } + + self.Configs[2] = { + "delta_type": "adapter", + } + + self.Configs[3] = { + "delta_type": "compacter", + } + + self.Configs[4] = { + "delta_type": "prefix", + } + + self.Configs[5] = { + "delta_type": "soft_prompt", + } + + self.Configs[6] = { + "delta_type": "low_rank_adapter", + } + + def get_delta_config(self, config_id): + return self.Configs[config_id] + + + def unitTest0(self, delta_config_dict): + model = self.models[0] from opendelta import Visualization Visualization(model).structure_graph() @@ -34,17 +58,78 @@ class UnitTest: from opendelta import Visualization Visualization(model).structure_graph() + + def unitTest1(self, delta_config_dict): + class Mymodel(nn.Module): + def __init__(self, a,b): + super().__init__() + self.a = a + self.b = b + + model = Mymodel(self.models[0], self.models[1]) + from opendelta import Visualization + Visualization(model).structure_graph() + from opendelta import AutoDeltaConfig, AutoDeltaModel + + delta_config = AutoDeltaConfig.from_dict(delta_config_dict) + delta_model = AutoDeltaModel.from_config(delta_config, backbone_model = model) + + from opendelta import Visualization + Visualization(model).structure_graph() + delta_model.save_finetuned("./tmp") + + delta_model.freeze_module(exclude=['deltas']) + delta_model.save_finetuned("./tmp") + + model = Mymodel(self.models[0], self.models[1]) + Visualization(model).structure_graph() + delta_model = AutoDeltaModel.from_finetuned("./tmp", backbone_model=model) + Visualization(model).structure_graph() + + + + + + + + def unit_test(self, test_id, config_id): + delta_config_dict = self.Configs[config_id] + if test_id == 0: + self.unitTest0(delta_config_dict) + elif test_id == 1: + self.unitTest1(delta_config_dict) + + +from dataclasses import dataclass, field + +@dataclass +class UnitTestArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
+ """ + config_id: int = field( + default=0, + ) + test_id: int = field( + default=0, + ) + model_name_or_path: str =field( + default='bert-base-cased', + metadata={"help": "tested: bert-base-cased, roberta-base, rinna/japanese-gpt2-small, t5-small, facebook/opt-125m"} + ) + + +from transformers import HfArgumentParser,TrainingArguments, AutoModel, GPT2Model def main(): - parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) + parser = HfArgumentParser((TrainingArguments, UnitTestArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + training_args, unit_test_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, training_args = parser.parse_args_into_dataclasses() - model_args: ModelArguments - data_args: DataArguments + training_args, unit_test_args = parser.parse_args_into_dataclasses() training_args: TrainingArguments if ( @@ -72,70 +157,25 @@ def main(): training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) - logger.info("MODEL parameters %s", model_args) - - set_seed(training_args.seed) - - num_labels = 1 - config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - num_labels=num_labels, - cache_dir=model_args.cache_dir, - ) - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=False, - ) - model = DRModel.build( - model_args, - data_args, - training_args, - config=config, - cache_dir=model_args.cache_dir, - ) - unit_test = UnitTest(model) + model = AutoModel.from_pretrained(unit_test_args.model_name_or_path) - # unit_test.unitTest1({ - # "delta_type": "bitfit", - # }) - # unit_test.unitTest1({ - # "delta_type": "lora", - # }) - unit_test.unitTest1({ - "delta_type": "adapter", - }) + import torch + import copy + models = [model, copy.deepcopy(model)] + + + unit_test = UnitTest(models) + + + unit_test.unit_test(unit_test_args.test_id, unit_test_args.config_id) + + - # train_dataset = DRTrainDataset(tokenizer, data_args, shuffle_seed=training_args.seed, cache_dir=data_args.data_cache_dir or model_args.cache_dir) - # eval_dataset = DREvalDataset(tokenizer, data_args, cache_dir=data_args.data_cache_dir or model_args.cache_dir) if data_args.eval_path is not None else None - - # tb_callback = TensorBoardCallback() - - # trainer_cls = GCDenseTrainer if training_args.grad_cache else Trainer - # trainer = trainer_cls( - # model=model, - # args=training_args, - # tokenizer=tokenizer, - # train_dataset=train_dataset, - # eval_dataset=eval_dataset, - # data_collator=QPCollator( - # tokenizer, - # max_p_len=data_args.p_max_len, - # max_q_len=data_args.q_max_len - # ), - # callbacks=[tb_callback] - # ) - # train_dataset.trainer = trainer - - # trainer.train() - # trainer.save_model() - # if trainer.is_world_process_zero(): - # tokenizer.save_pretrained(training_args.output_dir) if __name__ == "__main__":