diff --git a/.gitignore b/.gitignore index 9bb4b23..22d883c 100644 --- a/.gitignore +++ b/.gitignore @@ -35,4 +35,27 @@ log.txt **/examples/examples_bmtrain/BMPretrain **/examples/examples_bmtrain/BigModels/BigModels/results **/Delta_Memory/ +**/output/ +**/thunlp/ +**/saved_ckpts/ + + +DeltaCenter-Python-Client/ +backbone_structure +delta_checkpoints +gitop.sh +load_dataset_and_model.ipynb +load_model.py +scripts +t.py +t.sh +!examples/examples_prompt/configs/*/*.json +!examples/examples_prompt/configs/** +**/delta_checkpoints/ +**/outputs/ + + +**/unittest/** +!unittest/**.py +!unittest/**.sh diff --git a/README.md b/README.md index 1f84490..84c4867 100644 --- a/README.md +++ b/README.md @@ -26,16 +26,18 @@ OpenDelta is a toolkit for parameter-efficient tuning methods (we dub it as *delta tuning*), by which users could flexibly assign (or add) a small amount parameters to update while keeping the most paramters frozen. By using OpenDelta, users could easily implement prefix-tuning, adapters, Lora, or any other types of delta tuning with preferred PTMs. -- Our repo is tested on Python 3.8 and PyTorch 1.9.0. Lower version may also be supported. +- The latest version of OpenDelta is tested on Python==3.8.13, PyTorch==1.12.1, transformers==4.22.2. Other versions are likely to be supported as well. If you encounter bugs when using your own package versions, please raise an issue; we will look into it as soon as possible. - **A demo of using Opendelta to modify the PLM (E.g., BART).** ![How PLM changes using Delta-tuning](docs/source/imgs/demo.gif) -## Updates -- 2022.03.24 We notice several bugs in Soft Prompt Tuning and Prefix Tuning, mainly due to their need to customize attention ids, token_type_ids, we are fixing it! Currently, please use the other methods since they are stabler and better in performance. -- 2022.03.20 Add a [colab example](https://colab.research.google.com/drive/1uAhgAdc8Qr42UKYDlgUv0f7W1-gAFwGo?usp=sharing) to illustrate efficient training and space-saving multitask-serving. -- 2022.03.20 A new pip version released. -- 2022.02.16 Support [regular expression](https://opendelta.readthedocs.io/en/latest/notes/namebasedaddr.html#regexexpr) in named-based addressing. +## News +- **2022.10.14** Release v0.3.0. We make the usage of the default configurations of each delta tuning method (i.e., the positions they are attached to) more friendly! If a custom model has our supported models as submodules inside, the default configuration is also available. Other key changes can be seen in [Update Log](https://opendelta.readthedocs.io/en/latest/notes/update.html#version-0-3-0) +- **2022.10.10** Merge a long-developed branch v0.2.4 into the master branch. Key updates are (1) an example unifying the delta tuning paradigm and the prompt-tuning paradigm; (2) support for [Delta Center](https://www.openbmb.org/toolKits/deltacenter), whose webpage is still under construction. Details can be seen in [Update Log](https://opendelta.readthedocs.io/en/latest/notes/update.html#version-0-2-4) +- **2022.03.24** We notice several bugs in Soft Prompt Tuning and Prefix Tuning, mainly due to their need to customize attention ids and token_type_ids; we are fixing them! Currently, please use the other methods, since they are more stable and perform better. +- **2022.03.20** Add a [colab example](https://colab.research.google.com/drive/1uAhgAdc8Qr42UKYDlgUv0f7W1-gAFwGo?usp=sharing) to illustrate efficient training and space-saving multitask-serving. +- **2022.03.20** A new pip version released.
+- **2022.02.16** Support [regular expression](https://opendelta.readthedocs.io/en/latest/notes/namebasedaddr.html#regexexpr) in name-based addressing. ## Installation create a virtualenv (optional) @@ -72,20 +74,95 @@ python setup.py install python setup.py develop ``` -## Must Try +#### Tips +- If you want to use a mirror for installing the packages, please change the `index_url` in [setup.cfg](setup.cfg) -```python -from transformers import AutoModelForSeq2SeqLM -t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large") -from opendelta import AutoDeltaModel -delta = AutoDeltaModel.from_finetuned("thunlp/FactQA_T5-large_Adapter", backbone_model=t5) -delta.log() +- If you encounter a network error when using setup.py, please first install the dependencies via +```shell +pip install -r requirements.txt && python setup.py develop ``` -## Verified Supported Models +## Must Try +The following code and comments walk you through the key functionality of OpenDelta. It is also available in [must_try.py](https://github.com/thunlp/OpenDelta/tree/main/examples/unittest/must_try.py) + +```python +# use transformers as usual. +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer +t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large") +t5_tokenizer = AutoTokenizer.from_pretrained("t5-large") +# A running example +inputs_ids = t5_tokenizer.encode("Is Harry Poter wrtten by JKrowling", return_tensors="pt") +t5_tokenizer.decode(t5.generate(inputs_ids)[0]) +# >>> '? Is it Harry Potter?' + + +# use existing delta models +from opendelta import AutoDeltaModel, AutoDeltaConfig +# use existing delta models from DeltaCenter +delta = AutoDeltaModel.from_finetuned("thunlp/Spelling_Correction_T5_LRAdapter_demo", backbone_model=t5) +# freeze the whole backbone model except the delta models. +delta.freeze_module() +# visualize the change +delta.log() + + +t5_tokenizer.decode(t5.generate(inputs_ids)[0]) +# >>> Is Harry Potter written by JK Rowling? + + +# Now save merely the delta models, not the whole backbone model, to .tmp/ +delta.save_finetuned(".tmp") +import os; os.listdir(".tmp") +# >>> The state dict size is 1.443 MB +# >>> We encourage users to push their final and public models to delta center to share them with the community! + + +# reload the delta model from the local directory and add it to the pre-trained T5. +t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large") +delta1 = AutoDeltaModel.from_finetuned(".tmp", backbone_model=t5) +import shutil; shutil.rmtree(".tmp") # don't forget to remove the tmp files. +t5_tokenizer.decode(t5.generate(inputs_ids)[0]) +# >>> Is Harry Potter written by JK Rowling? + +# detach the delta models; the model returns to its unmodified state. +delta1.detach() +t5_tokenizer.decode(t5.generate(inputs_ids)[0]) +# >>> '? Is it Harry Potter?' + +# use the default configuration for customized wrapped models which have PLMs inside. This is a common need for users. +import torch.nn as nn +class WrappedModel(nn.Module): + def __init__(self, inner_model): + super().__init__() + self.inner = inner_model + def forward(self, *args, **kwargs): + return self.inner(*args, **kwargs) + +wrapped_model = WrappedModel(WrappedModel(t5)) + +# say we use LoRA +delta_config = AutoDeltaConfig.from_dict({"delta_type":"lora"}) +delta2 = AutoDeltaModel.from_config(delta_config, backbone_model=wrapped_model) +delta2.log() +# >>> root +# -- inner +# -- inner +# ... +# ...
lora_A:[8,1024], lora_B:[1024,8] delta2.detach() + +# use a non-default configuration +# say we add lora to the last four layers of the decoder of t5, with lora rank=5 +delta_config3 = AutoDeltaConfig.from_dict({"delta_type":"lora", "modified_modules":["[r]decoder.*((20)|(21)|(22)|(23)).*DenseReluDense\.wi"], "lora_r":5}) +delta3 = AutoDeltaModel.from_config(delta_config3, backbone_model=wrapped_model) +delta3.log() + +``` + +## Verified Default Configurations - **You can try to use OpenDelta on *any* backbone models based on PyTorch.** -- However, with small chances thatThe interface of the submodules of the backbone model is not supported. Therefore we verified some commonly +- However, there is a small chance that the interface of the submodules of the backbone model is not supported. Therefore we verified some commonly used models that OpenDelta are sure to support. - We will keep testing more and more emerging models. @@ -107,3 +184,5 @@ used models that OpenDelta are sure to support. + + diff --git a/dist/opendelta-0.2.0-py3-none-any.whl b/dist/opendelta-0.2.0-py3-none-any.whl new file mode 100644 index 0000000..c00ffc5 Binary files /dev/null and b/dist/opendelta-0.2.0-py3-none-any.whl differ diff --git a/dist/opendelta-0.2.0.tar.gz b/dist/opendelta-0.2.0.tar.gz new file mode 100644 index 0000000..c7468a2 Binary files /dev/null and b/dist/opendelta-0.2.0.tar.gz differ diff --git a/dist/opendelta-0.2.1-py3-none-any.whl b/dist/opendelta-0.2.1-py3-none-any.whl new file mode 100644 index 0000000..6fbe1ca Binary files /dev/null and b/dist/opendelta-0.2.1-py3-none-any.whl differ diff --git a/dist/opendelta-0.2.1.tar.gz b/dist/opendelta-0.2.1.tar.gz new file mode 100644 index 0000000..a915207 Binary files /dev/null and b/dist/opendelta-0.2.1.tar.gz differ diff --git a/dist/opendelta-0.2.2-py3-none-any.whl b/dist/opendelta-0.2.2-py3-none-any.whl new file mode 100644 index 0000000..f0d580e Binary files /dev/null and b/dist/opendelta-0.2.2-py3-none-any.whl differ diff --git a/dist/opendelta-0.2.2.tar.gz b/dist/opendelta-0.2.2.tar.gz new file mode 100644 index 0000000..5400092 Binary files /dev/null and b/dist/opendelta-0.2.2.tar.gz differ diff --git a/dist/opendelta-0.2.3-py3-none-any.whl b/dist/opendelta-0.2.3-py3-none-any.whl new file mode 100644 index 0000000..ac3e3d9 Binary files /dev/null and b/dist/opendelta-0.2.3-py3-none-any.whl differ diff --git a/dist/opendelta-0.2.3.tar.gz b/dist/opendelta-0.2.3.tar.gz new file mode 100644 index 0000000..2a2d57e Binary files /dev/null and b/dist/opendelta-0.2.3.tar.gz differ diff --git a/dist/opendelta-0.2.4-py3-none-any.whl b/dist/opendelta-0.2.4-py3-none-any.whl new file mode 100644 index 0000000..6f82355 Binary files /dev/null and b/dist/opendelta-0.2.4-py3-none-any.whl differ diff --git a/dist/opendelta-0.2.4.tar.gz b/dist/opendelta-0.2.4.tar.gz new file mode 100644 index 0000000..456ad09 Binary files /dev/null and b/dist/opendelta-0.2.4.tar.gz differ diff --git a/docs/requirements.txt b/docs/requirements.txt index 2f0ef65..2e18d0b 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,13 +1,17 @@ sphinx_copybutton sphinx_rtd_theme sphinx_toolbox -torch -transformers -sentencepiece==0.1.96 -tqdm==4.62.2 -openprompt -loralib +myst_parser + +torch>=1.8.0 +transformers>=4.10.0 +datasets==1.17.0 +sentencepiece>=0.1.96 +tqdm>=4.62.2 decorator rich -myst_parser -web.py \ No newline at end of file +web.py +gitpython +scipy # need? +sklearn # need?
+delta_center_client==0.0.4 \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index e2f01a3..1be8e51 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -19,7 +19,9 @@ import datetime import sphinx_rtd_theme import doctest import opendelta -import opendelta.delta_models + + + # -- Project information ----------------------------------------------------- @@ -29,8 +31,8 @@ copyright = '{}, {}, Licenced under the Apache License, Version 2.0'.format(date # The full version, including alpha/beta/rc tags -release = '0.1.1' -version = "0.1.1" +release = '0.3.1' +version = "0.3.1" html_theme = 'sphinx_rtd_theme' html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] diff --git a/docs/source/index.md b/docs/source/index.md index acbf095..dac3594 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,7 +1,7 @@ OpenDelta's documentation! ===================================== -OpenDelta is a **Plug-and-play** Library of the parameter-efficient fine-tuning ([delta-tuning](WhatisDelta)) technology for pre-trained models. +[OpenDelta](https://github.com/thunlp/OpenDelta/) is a **Plug-and-play** Library of the parameter-efficient fine-tuning ([delta-tuning](WhatisDelta)) technology for pre-trained models. ## Essential Advantages: @@ -35,11 +35,18 @@ OpenDelta is a **Plug-and-play** Library of the parameter-efficient fine-tuning notes/pluginunplug.md notes/acceleration.md notes/explored_config.md + +.. toctree:: + :maxdepth: 1 + :caption: Information + notes/citation.md + notes/update.md + notes/faq.md .. toctree:: :maxdepth: 2 - :caption: Package Reference + :caption: Documentation modules/base modules/deltas diff --git a/docs/source/notes/citation.md b/docs/source/notes/citation.md index 4c41201..47a88ba 100644 --- a/docs/source/notes/citation.md +++ b/docs/source/notes/citation.md @@ -1,3 +1,12 @@ # Citation - We are working on a technical report. \ No newline at end of file +If you find our repo useful, please cite the following paper. + +``` +@article{ding2022delta, + title={Delta tuning: A comprehensive study of parameter efficient methods for pre-trained language models}, + author={Ding, Ning and Qin, Yujia and Yang, Guang and Wei, Fuchao and Yang, Zonghan and Su, Yusheng and Hu, Shengding and Chen, Yulin and Chan, Chi-Min and Chen, Weize and others}, + journal={arXiv preprint arXiv:2203.06904}, + year={2022} +} +``` \ No newline at end of file diff --git a/docs/source/notes/composition.md b/docs/source/notes/composition.md index 151aa37..a32db2c 100644 --- a/docs/source/notes/composition.md +++ b/docs/source/notes/composition.md @@ -1,10 +1,9 @@ -(composition)= # Composition of delta models With OpenDelta, you can perform compostion of different delta models. 
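As a side note for readers skimming this part of the diff, here is a minimal sketch of the composition that composition.md describes: two delta models attached to the same backbone. It is an illustration only; the backbone `"roberta-base"` and the `modified_modules` values below are assumptions, not the doc's exact snippet.

```python
# Hedged sketch: compose two delta models on one backbone (assumed names/modules).
from transformers import AutoModelForSequenceClassification
from opendelta import AdapterModel, LoraModel

model = AutoModelForSequenceClassification.from_pretrained("roberta-base")

# First delta: adapters attached via OpenDelta's end-matching name rules.
delta_model = AdapterModel(backbone_model=model, modified_modules=["output"])
# Second delta: LoRA added to the attention projections of the same backbone.
delta_model2 = LoraModel(backbone_model=model, modified_modules=["query", "value"])

delta_model.log()  # visualize the backbone with both deltas attached
```

Both deltas live on the same backbone object, which is what the sections of composition.md changed below demonstrate.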
-### Add different deltas to the backbone +## Add different deltas to the backbone ``` from transformers import AutoModelForSequenceClassification @@ -18,14 +17,14 @@ delta_model.log() ```{figure} ../imgs/composition_of_delta.png --- width: 600px -name: defaultmodification +name: composition_of_delta --- ``` ```` -### Even add multiple delta to the same layer +## Even add multiple delta to the same layer ``` from transformers import AutoModelForSequenceClassification @@ -40,7 +39,7 @@ delta_model.log() ```{figure} ../imgs/multiple_to_one_layer.png --- width: 600px -name: defaultmodification +name: multiple_to_one_layer --- ``` ```` diff --git a/docs/source/notes/explored_config.md b/docs/source/notes/explored_config.md index 34bd1f4..3855376 100644 --- a/docs/source/notes/explored_config.md +++ b/docs/source/notes/explored_config.md @@ -1,11 +1,7 @@ (favoredconfiguration)= # Favored Configuration - We will add the commonly used configuration of delta models HERE in future. +Generally, the default configurations are already good enough. If you want to squeeze the size of delta models further, you can refer to the following papers. -E.g. -- the modified_modules (position of delta), -- hyperparameter that are the most efficient -- the favored composition between delta models - -Currenlty, use the default setting, explore it by yourself, or refer to existing papers' configuration! \ No newline at end of file + - [AdapterDrop: On the Efficiency of Adapters in Transformers](https://arxiv.org/abs/2010.11918) + - [Sparse Structure Search for Parameter-Efficient Tuning (Delta Tuning)](https://arxiv.org/abs/2206.07382) \ No newline at end of file diff --git a/docs/source/notes/faq.md b/docs/source/notes/faq.md new file mode 100644 index 0000000..164c3a0 --- /dev/null +++ b/docs/source/notes/faq.md @@ -0,0 +1,14 @@ +# FAQs + +1. **Why do I encounter NotImplementedError in Prefix Tuning?** + + This is because we find no easy way to get a unified Prefix Tuning implementation for different attention classes. If you really want to use Prefix Tuning for the models we have not supported, you can implement the ``PrefixLayerYOURMODEL`` on your own or raise an issue to request the feature for your model. + +2. **Available Models with default configurations are ..., Please manually add the delta models by speicifying 'modified_modules' based on the visualization of your model structure** + + Although most pre-trained models (PTMs) use the Transformer architecture, they are implemented differently. For example, the attention module in GPT2 and BERT is not only named differently, but also implemented in different ways. Common structure mapping maps the different name conventions of different PTMs into a unified name convention. But there are many PTMs that we do not currently cover. Don't worry! For these models, you can figure out which modules you should modify by simply [visualizing the PTMs](visualization), and then specify the `modified_modules` manually (see [name-based addressing](namebasedaddr)). + + +3. **Requires a dummy_inputs to be passed through the model to understand the dimensionality of each tensor in the computation graph. The {module.__class__.__name__} Class has no dummy_inputs, and automatically created dummy_inputs failed.** + + The `dummy_inputs` can be any data that makes `backbone_model.forward(**dummy_inputs)` succeed. Only the form and shape of the `dummy_inputs` matter.
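For instance, a minimal sketch of such `dummy_inputs` (the model choice, key names, and shapes are assumptions for a text backbone; adjust them to your own model):

```python
# Hypothetical illustration: any inputs that let forward(**dummy_inputs) run will do.
import torch
from transformers import AutoModel

backbone_model = AutoModel.from_pretrained("bert-base-uncased")  # assumed backbone
dummy_inputs = {
    "input_ids": torch.zeros(1, 8, dtype=torch.long),
    "attention_mask": torch.ones(1, 8, dtype=torch.long),
}
# Attach the dummy inputs before constructing the delta model.
setattr(backbone_model, "dummy_inputs", dummy_inputs)
```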
To set dummy_inputs for your model, please use: `setattr(backbone_model, 'dummy_inputs', some_dummy_inputs)` before initializing `{self.__class__.__name__}`. \ No newline at end of file diff --git a/docs/source/notes/keyfeature.md b/docs/source/notes/keyfeature.md index b661469..dc71d82 100644 --- a/docs/source/notes/keyfeature.md +++ b/docs/source/notes/keyfeature.md @@ -38,7 +38,7 @@ We use three key functions to achieve the modifications to the backbone model ou - **parallel insertion** Adapters can also be used in a parallel fashion (see [Paper](https://arxiv.org/abs/2110.04366)). - For these methods, use [insert_parallel_module](opendelta.basemodel.DeltaBase.insert_parrellel_module) interface. + For these methods, use [insert_parallel_module](opendelta.basemodel.DeltaBase.insert_parallel_module) interface. :::{admonition} Doc-preserving Insertion diff --git a/docs/source/notes/namebasedaddr.md b/docs/source/notes/namebasedaddr.md index 0e4e200..4987d6f 100644 --- a/docs/source/notes/namebasedaddr.md +++ b/docs/source/notes/namebasedaddr.md @@ -1,4 +1,4 @@ -(namebasedaddr)= + # Name-based Addressing Named based addressing is what set OpenDelta apart from other packages and provide the possibility to be used to a broader range of models (even emerging ones). @@ -52,7 +52,7 @@ In this case, string `"name_b.0.name_a"` will be the name to address the submodu Thus when applying a delta model to this toy net. -``` +```python from opendelta import AdapterModel AdapterModel(backbone_model=root, modified_modules=['name_b.0.name_a']) Visualization(root).structure_graph() @@ -67,7 +67,7 @@ name: toy-delta ``` ```` - +(targetmodules)= ## Target modules. For different delta methods, the operation for the modification target is different. @@ -88,7 +88,7 @@ Handcrafting the full names of submodules can be frustrating. We made some simpl 1. **End-matching** Rules. OpenDelta will take every modules that - **ends with** the provided name suffix as the modification [target module](target_module). + **ends with** the provided name suffix as the modification [target module](targetmodules). :::{admonition} Example :class: tip Taking DistilBert with an classifier on top as an example: @@ -115,7 +115,7 @@ Handcrafting the full names of submodules can be frustrating. We made some simpl :::{admonition} Regex in Json Configs :class: warning In json, you should write `"\\."` instead of `"\."` for a real dot due to json parsing rules. That is - ```json + ``` { ... "modified_moduls": ['[r][0-5]\\.attention'], @@ -138,7 +138,7 @@ Handcrafting the full names of submodules can be frustrating. We made some simpl delta_model = LoraModel(backbone_model=model, interactive_modify=True) ``` - by setting `interactive_modify`, a web server will be opened on local host, and the link will be print in the terminal. 
+ by setting `interactive_modify`, a web server will be opened on the local host, and the link will be printed in the terminal, e.g., ``` http://0.0.0.0:8888/ diff --git a/docs/source/notes/pluginunplug.md b/docs/source/notes/pluginunplug.md index eeadd57..dae80c8 100644 --- a/docs/source/notes/pluginunplug.md +++ b/docs/source/notes/pluginunplug.md @@ -19,7 +19,7 @@ delta_model.log() ```{figure} ../imgs/plugunplug1.png --- width: 800px -name: defaultmodification +name: plugunplug1 --- ``` ```` @@ -33,7 +33,7 @@ delta_model.log() ```{figure} ../imgs/plugunplug2.png --- width: 800px -name: defaultmodification +name: plugunplug2 --- ``` ```` @@ -48,7 +48,7 @@ delta_model.log() ```{figure} ../imgs/plugunplug3.png --- width: 800px -name: defaultmodification +name: plugunplug3 --- ``` ```` @@ -67,7 +67,7 @@ delta_model2.log() ```{figure} ../imgs/plugunplug4.png --- width: 800px -name: defaultmodification +name: plugunplug4 --- ``` ```` @@ -81,7 +81,7 @@ delta_model.log() ```{figure} ../imgs/plugunplug5.png --- width: 800px -name: defaultmodification +name: plugunplug5 --- ``` ```` @@ -96,7 +96,7 @@ delta_model.log() ```{figure} ../imgs/plugunplug6.png --- width: 800px -name: defaultmodification +name: plugunplug6 --- ``` ```` diff --git a/docs/source/notes/saveload.md b/docs/source/notes/saveload.md index ecddd23..24a0ce1 100644 --- a/docs/source/notes/saveload.md +++ b/docs/source/notes/saveload.md @@ -1,4 +1,3 @@ -(saveload)= # Save and Share the Delta ## Space efficient saving without changing the code. @@ -95,4 +94,4 @@ If you are satisfied with your checkpoint, do not forget to share your model to ## Save & Load for Composition of Delta - Currently save & load method is not suitable for [composition of delta model](compositon). Please wait for future releases. \ No newline at end of file + Currently the save & load method is not suitable for [composition](composition) of delta models. Please wait for future releases. \ No newline at end of file diff --git a/docs/source/notes/unifyname.md b/docs/source/notes/unifyname.md index c8117ee..f77fd68 100644 --- a/docs/source/notes/unifyname.md +++ b/docs/source/notes/unifyname.md @@ -1,4 +1,4 @@ -(unifyname)= +(commonstructure)= # Common Structure Mapping @@ -41,7 +41,7 @@ Visualize bert-base using a common structure name: The submodules that are not c ```{figure} ../imgs/commonstructure_vis.png :width: 600px -:name: transformers_structure +:name: commonstructure_vis ``` (mappingexample)= diff --git a/docs/source/notes/update.md b/docs/source/notes/update.md new file mode 100644 index 0000000..26626d2 --- /dev/null +++ b/docs/source/notes/update.md @@ -0,0 +1,29 @@ +# Update Logs and Known Issues + +## Version 0.3.1 +- We update [must_try.py](https://github.com/thunlp/OpenDelta/tree/main/examples/unittest/must_try.py) as a simple introduction to the core functionality of OpenDelta. +- Thanks to [Weilin Zhao](https://github.com/Achazwl), we merge a long-developed branch parallel_adapter into the main branch. + + +## Version 0.3.0 +### Updates: +- Add this changelog for a granular record of updates. +- The default configuration of delta models can be applied to more wrapped models.
+ - There is less need to configure 'modified_modules' for wrapped models like [BertForSequenceClassification](https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification) or even [OpenMatch.DRModel](https://github.com/OpenMatch/OpenMatch/blob/master/src/openmatch/modeling/dense_retrieval_model.py#L37), as long as it contains a model for which we support a default configuration. **Note that if you customize `modified_modules` by yourself, most PyTorch models are supported.** +- LoRA and BitFit models now do not need pseudo data to instantiate the model. +- BitFit models can now support [Conv1D](https://huggingface.co/docs/transformers/v4.23.1/en/internal/modeling_utils#transformers.Conv1D) using the default configuration. +- Improve type hints for AutoDeltaModel. +- Fix bugs in documentation. +- Fix small bugs when saving a model without a config attribute. +- Make the default modified modules of adapter-like methods more accurate: attach the adapter-like modules after the output of the attention layer and the second feed-forward layer, both before the layer norm layers. +- A simple unit test folder containing development-time tests has been added for interested users. + + +### Known Issues +- SoftPrompt is still not supported for wrapped models if the model has no attribute `get_input_embeddings`. +- Prefix Tuning is still limited to T5, GPT2, Bart, Bert, Roberta. + +## Version 0.2.4 +### Updates +- examples/examples_seq2seq and examples/examples_text-classification are deprecated and moved to [legacy](https://github.com/thunlp/OpenDelta/tree/main/examples/legacies) +- Thanks to [Zhen Zhang](https://github.com/namezhenzhang), we provide [examples_prompt](https://github.com/thunlp/OpenDelta/tree/main/examples/examples_prompt), as a cleaner and more general framework, which unifies the delta tuning paradigm and the prompt-tuning paradigm. It is still based on [Huggingface Trainers](https://huggingface.co/docs/transformers/main_classes/trainer). In this example framework, the running pipeline is [a unified script](https://github.com/thunlp/OpenDelta/tree/main/examples/examples_prompt/src), and the differences in tasks, models, delta tuning models, and even prompt-tuning paradigms are [more modular and more independent](https://github.com/thunlp/OpenDelta/tree/main/examples/examples_prompt/backbones). Please try it out! \ No newline at end of file diff --git a/docs/source/notes/usage.md b/docs/source/notes/usage.md index 6fc7ed4..c5d0614 100644 --- a/docs/source/notes/usage.md +++ b/docs/source/notes/usage.md @@ -12,7 +12,7 @@ model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base") ## STEP 2: Add delta modules We provide two alternatives to add the delta modules. ### 2.1 Modification based on visualization -Suppose we want to make the feedforward layer of each block as our [modification target module](target_module), +Suppose we want to make the feedforward layer of each block our [modification target module](targetmodules), We should first know what is the name of the feedforward layer in the BART model by visualization. *For more about visualization, see [Visualization](visualization).* ```python @@ -48,7 +48,7 @@ delta_model.log() # This will visualize the backbone after modification and othe ### 2.2 Use the default modification. We also provide the default modifications of each delta methods for some commonly used PTMs (e.g., BERT, RoBERTA, DistilBERT, T5, GPT2), so the users don't need to specify the submodules to modify.
-The default modifications is achieved by mapping a name of a submodule to it's name on a common transformer structure. *For details about the common structure mapping, see [Common Structure Mapping](unifyname)* +The default modifications are achieved by mapping the name of a submodule to its name on a common transformer structure. *For details about the common structure mapping, see [Common Structure Mapping](commonstructure)* diff --git a/docs/source/notes/visualization.md b/docs/source/notes/visualization.md index d873d8e..ae6a805 100644 --- a/docs/source/notes/visualization.md +++ b/docs/source/notes/visualization.md @@ -1,4 +1,3 @@ -(visualization)= # Visualize the Parameters When OpenDelta makes modifications to a pretrained model (PTM), it is beneficial to know what your PTM looks like, especially the location of the parameters. diff --git a/examples/examples_prompt/README.md b/examples/examples_prompt/README.md index e9d5249..d6b3329 100644 --- a/examples/examples_prompt/README.md +++ b/examples/examples_prompt/README.md @@ -1,24 +1,59 @@ -# !!!!This example collection is still under develop, please wait for some time to use it. +# Examples of using OpenDelta together with 🤗 transformers. -## install the repo +In this repo, we construct a very general pipeline to train and test a PLM using +🤗 transformers. + +The pipeline was constructed together with [openpromptu](https://pypi.org/project/openpromptu/), which is a lightweight and +model-agnostic version of [openprompt](https://github.com/thunlp/OpenPrompt). + +## Pool of PLMs +We are going to adapt most of the models in 🤗 transformers +in this repo. The different pipelines, processing steps, and configurations are specified +in `./backbones/`. You can add your own model there to support customized models. + + +### An example script to run the repo in offline mode ```bash -cd ../ -python setup_seq2seq.py develop +conda activate [YOURENV] +PATHBASE=[YOURPATH] + +JOBNAME="adapter_t5-base" +DATASET="superglue-cb" + +cd $PATHBASE/OpenDelta/examples/examples_prompt/ +python configs/gen_t5.py --job $JOBNAME + +export TRANSFORMERS_OFFLINE=1 +export HF_DATASETS_OFFLINE=1 +python src/run.py configs/$JOBNAME/$DATASET.json \ +--model_name_or_path [YOURPATH_TO_T5_BASE] \ +--tokenizer_name [YOURPATH_TO_T5_BASE] \ +--datasets_saved_path [YOURPATH_TO_CB_DATASETS] \ +--finetuned_delta_path ${PATHBASE}/delta_checkpoints/ \ +--num_train_epochs 20 \ +--bottleneck_dim 24 \ +--delay_push True ``` -This will add `examples_seq2seq` to the environment path of the python lib. -## Generating the json configuration file +## An example of quickly testing the repo -```shell -python configs/gen_$BACKBONETYPE.py --job $YOURJOB -#e.g. python configs/gen_beit.py --job lora_beit-base-patch16-224 -``` -The available job configuration (e.g., `--job lora_beit-base-patch16-224`) can be seen from the scripts. You can also -create your only configuration.
+```bash +conda activate [YOURENV] +PATHBASE=[YOURPATH] +JOBNAME="adapter_t5-base" +DATASET="superglue-cb" -## Run the code +cd $PATHBASE/OpenDelta/examples/examples_prompt/ -``` -CUDA_VISIBLE_DEVICES=1 python src/run.py configs/lora_beit-base-patch16-224/beans.json -``` +export TRANSFORMERS_OFFLINE=1 +export HF_DATASETS_OFFLINE=1 +export DELTACENTER_OFFLINE=0 +python src/test.py configs/$JOBNAME/$DATASET.json \ +--model_name_or_path [YOURPATH_TO_T5_BASE] \ +--tokenizer_name [YOURPATH_TO_T5_BASE] \ +--datasets_saved_path [YOURPATH_TO_CB_DATASETS] \ +--finetuned_delta_path thunlp/t5-base_adapter_superglue-cb_20220701171436c80 \ +--delta_cache_dir "./delta_checkpoints/" \ +--force_download True +``` \ No newline at end of file diff --git a/examples/examples_prompt/backbones/bart.py b/examples/examples_prompt/backbones/bart.py index bab8303..6b9dd92 100644 --- a/examples/examples_prompt/backbones/bart.py +++ b/examples/examples_prompt/backbones/bart.py @@ -26,14 +26,14 @@ def preprocess_function(raw_example, **kwargs): example = InputExample(**raw_example) - try: - example = verbalizer.wrap_one_example(example) - example, other = template.wrap_one_example(example) - input_sentence = tokenizer_wrapper.merge_wrapped_example(example) - model_inputs = tokenizer(input_sentence, max_length=256, - padding="max_length", truncation=True) - except: - from IPython import embed; embed(header="Therer") + + example = verbalizer.wrap_one_example(example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=256, + padding="max_length", truncation=True) + + with tokenizer.as_target_tokenizer(): label = tokenizer(other['tgt_text']).input_ids @@ -43,7 +43,8 @@ def preprocess_function(raw_example, **kwargs): def get_backbone(model_args, **kwargs): config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, + # model_args.config_name if model_args.config_name else model_args.model_name_or_path, + model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, diff --git a/examples/examples_prompt/backbones/beit.py b/examples/examples_prompt/backbones/beit.py index 4494fed..c35bd4e 100644 --- a/examples/examples_prompt/backbones/beit.py +++ b/examples/examples_prompt/backbones/beit.py @@ -8,7 +8,6 @@ from transformers import ( AutoFeatureExtractor, AutoModelForImageClassification, ) -from transformers import ViTFeatureExtractor from transformers import Trainer as HfTrainer import torch.nn as nn @@ -26,9 +25,10 @@ def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): def preprocess_function(raw_example, **kwargs): # from IPython import embed; embed(header="Therefa") tokenizer = kwargs['tokenizer'] - model_inputs = tokenizer(raw_example['image'], return_tensors='pt') + # print(np.array(raw_example['img']).shape) + model_inputs = tokenizer(np.array(raw_example['image']), return_tensors='pt') model_inputs['pixel_values'] = model_inputs['pixel_values'].squeeze() - model_inputs['labels'] = raw_example['labels'] + model_inputs['labels'] = raw_example['label'] return model_inputs def compute_metrics(eval_preds, dataset_name, eval_metric): @@ -55,7 +55,7 @@ def mask_token_func(tokenizer, ith_mask=0): def get_remove_columns(dataset_features): # dataset_features.pop("label") - print("remove_columns: 
{}".format(dataset_features)) + # print("remove_columns: {}".format(dataset_features)) return dataset_features class DataCollator(HfDataCollatorMixin): diff --git a/examples/examples_prompt/backbones/bigbird_.py b/examples/examples_prompt/backbones/bigbird_.py new file mode 100644 index 0000000..8945103 --- /dev/null +++ b/examples/examples_prompt/backbones/bigbird_.py @@ -0,0 +1,169 @@ +from openpromptu.data_utils import InputExample +import torch +from transformers.data.data_collator import torch_default_data_collator +from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin +from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator +import numpy as np +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, +) + +from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer +import copy +from torch.nn import CrossEntropyLoss + +def preprocess_function(raw_example, **kwargs): + tokenizer = kwargs['tokenizer'] + data_args = kwargs['data_args'] + template = kwargs['template'] + verbalizer = kwargs['verbalizer'] + tokenizer_wrapper = kwargs['tokenizer_wrapper'] + + example = InputExample(**raw_example) + # example = verbalizer.wrap_one_example(example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length, + padding="max_length", truncation=True) + return model_inputs + + + +def compute_metrics(eval_preds, dataset_name, eval_metric): + pass + +def mask_token_func(tokenizer, ith_mask=0): + return tokenizer.pad_token + +def get_remove_columns(dataset_features): + # dataset_features.remove("label") + return dataset_features + +def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): + from openpromptu.prompts import GenerationVerbalizer + from openpromptu.prompts import ManualTemplate + from openpromptu import TokenizerWrapper + template = ManualTemplate(text = task.templates_text[template_id]) + verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = None, label_words=None) + tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func) + return template, verbalizer, tokenizer_wrapper + + +def get_backbone(model_args, **kwargs): + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + # config.dropout_rate = 0.0 + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + return config, tokenizer, model + +class Trainer(HfSeq2SeqTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.eval_task = eval_task + self.compute_metrics = 
self._compute_metrics + + def compute_loss(self, model, inputs, return_outputs=False): + + labels=copy.deepcopy(inputs['input_ids']) + # labels[labels==self.tokenizer.pad_token_id]=-100 + outputs = model(**inputs) + logits = outputs.logits + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.long().view(-1)) + + return (loss, outputs) if return_outputs else loss + + def prediction_step( + self, + model, #nn.Module, + inputs, #Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only, #: bool, + ignore_keys, #: Optional[List[str]] = None, + ): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). + """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + inputs = self._prepare_inputs(inputs) + with torch.no_grad(): + labels=copy.deepcopy(inputs['input_ids']) + # labels[labels==self.tokenizer.pad_token_id]=-100 + outputs = model(**inputs) + logits = outputs.logits + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous().long() + loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1)).detach().cpu() + loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss) + + if prediction_loss_only: + return (loss, None, None) + else: + # non pad label + shift_labels = shift_labels.view(-1).detach().cpu() + nonpad_idx = shift_labels!=self.tokenizer.pad_token_id + shift_labels = shift_labels[nonpad_idx] + # the probability at the corresponding position + shift_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu() + target_position = torch.nn.functional.one_hot(shift_labels,shift_logits.shape[-1]).bool().to(shift_labels.device) + shift_logits = shift_logits.softmax(dim=-1)[target_position] + + + return (loss, shift_logits, shift_labels) + + def _compute_metrics(self, eval_preds): + + preds, labels = eval_preds + + result = {} + for metric in self.eval_task.metric: + result.update(metric(preds, labels,ignore_index=self.tokenizer.pad_token_id)) + + average_metric = sum(result.values())/len(result) + result.update({"average_metrics":average_metric}) + return result \ No newline at end of file diff --git a/examples/examples_prompt/backbones/blenderbot.py b/examples/examples_prompt/backbones/blenderbot.py index c1e8876..54e4ec8 100644 --- a/examples/examples_prompt/backbones/blenderbot.py +++ b/examples/examples_prompt/backbones/blenderbot.py @@ -26,14 +26,13 @@ def 
preprocess_function(raw_example, **kwargs): example = InputExample(**raw_example) - try: - example = verbalizer.wrap_one_example(example) - example, other = template.wrap_one_example(example) - input_sentence = tokenizer_wrapper.merge_wrapped_example(example) - model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length, - padding="max_length", truncation=True) - except: - from IPython import embed; embed(header="Therer") + + example = verbalizer.wrap_one_example(example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length, + padding="max_length", truncation=True) + with tokenizer.as_target_tokenizer(): label = tokenizer(other['tgt_text']).input_ids @@ -165,7 +164,7 @@ class Trainer(HfSeq2SeqTrainer): return (loss, generated_tokens, labels) def _compute_metrics(self, eval_preds): - from IPython import embed; embed(header="In compute metrics") + # from IPython import embed; embed(header="In compute metrics") preds, labels = eval_preds decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) diff --git a/examples/examples_prompt/backbones/opt.py b/examples/examples_prompt/backbones/opt.py new file mode 100644 index 0000000..5902bc9 --- /dev/null +++ b/examples/examples_prompt/backbones/opt.py @@ -0,0 +1,171 @@ +from openpromptu.data_utils import InputExample +import torch +from transformers.data.data_collator import torch_default_data_collator +from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin +from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator +import numpy as np +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, +) + +from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer +import copy +from torch.nn import CrossEntropyLoss + +def preprocess_function(raw_example, **kwargs): + tokenizer = kwargs['tokenizer'] + data_args = kwargs['data_args'] + template = kwargs['template'] + verbalizer = kwargs['verbalizer'] + tokenizer_wrapper = kwargs['tokenizer_wrapper'] + + example = InputExample(**raw_example) + # example = verbalizer.wrap_one_example(example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length, + padding="max_length", truncation=True) + return model_inputs + + + +def compute_metrics(eval_preds, dataset_name, eval_metric): + pass + +def mask_token_func(tokenizer, ith_mask=0): + return tokenizer.pad_token + +def get_remove_columns(dataset_features): + # dataset_features.remove("label") + return dataset_features + +def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): + from openpromptu.prompts import GenerationVerbalizer + from openpromptu.prompts import ManualTemplate + from openpromptu import TokenizerWrapper + template = ManualTemplate(text = task.templates_text[template_id]) + verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = None, label_words=None) + tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="tail", mask_token_func=mask_token_func) + return template, verbalizer, tokenizer_wrapper + + +def get_backbone(model_args, **kwargs): + config = 
AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + # config.dropout_rate = 0.0 + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + if not hasattr(tokenizer,"pad_token") or (hasattr(tokenizer,"pad_token") and tokenizer.pad_token==None): + tokenizer.pad_token = tokenizer.eos_token + + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + return config, tokenizer, model + +class Trainer(HfSeq2SeqTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.eval_task = eval_task + self.compute_metrics = self._compute_metrics + + def compute_loss(self, model, inputs, return_outputs=False): + + labels=copy.deepcopy(inputs['input_ids']) + # labels[labels==self.tokenizer.pad_token_id]=-100 + outputs = model(**inputs) + logits = outputs.logits + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.long().view(-1)) + + return (loss, outputs) if return_outputs else loss + + def prediction_step( + self, + model, #nn.Module, + inputs, #Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only, #: bool, + ignore_keys, #: Optional[List[str]] = None, + ): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). 
+ """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + inputs = self._prepare_inputs(inputs) + with torch.no_grad(): + labels=copy.deepcopy(inputs['input_ids']) + # labels[labels==self.tokenizer.pad_token_id]=-100 + outputs = model(**inputs) + logits = outputs.logits + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous().long() + loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1)).detach().cpu() + loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss) + + if prediction_loss_only: + return (loss, None, None) + else: + # non pad label + shift_labels = shift_labels.view(-1).detach().cpu() + nonpad_idx = shift_labels!=self.tokenizer.pad_token_id + shift_labels = shift_labels[nonpad_idx] + # the probability at the corresponding position + shift_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu() + target_position = torch.nn.functional.one_hot(shift_labels,shift_logits.shape[-1]).bool().to(shift_labels.device) + shift_logits = shift_logits.softmax(dim=-1)[target_position] + + + return (loss, shift_logits, shift_labels) + + def _compute_metrics(self, eval_preds): + + preds, labels = eval_preds + + result = {} + for metric in self.eval_task.metric: + result.update(metric(preds, labels,ignore_index=self.tokenizer.pad_token_id)) + + average_metric = sum(result.values())/len(result) + result.update({"average_metrics":average_metric}) + return result \ No newline at end of file diff --git a/examples/examples_prompt/backbones/t5.py b/examples/examples_prompt/backbones/t5.py index 7a6edf0..15e7f21 100644 --- a/examples/examples_prompt/backbones/t5.py +++ b/examples/examples_prompt/backbones/t5.py @@ -26,14 +26,13 @@ def preprocess_function(raw_example, **kwargs): example = InputExample(**raw_example) - try: - example = verbalizer.wrap_one_example(example) - example, other = template.wrap_one_example(example) - input_sentence = tokenizer_wrapper.merge_wrapped_example(example) - model_inputs = tokenizer(input_sentence, max_length=256, - padding="max_length", truncation=True) - except: - from IPython import embed; embed(header="Therer") + + example = verbalizer.wrap_one_example(example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=256, + padding="max_length", truncation=True) + with tokenizer.as_target_tokenizer(): label = tokenizer(other['tgt_text']).input_ids diff --git a/examples/examples_seq2seq/__init__.py b/examples/examples_prompt/backbones/vit.py similarity index 100% rename from examples/examples_seq2seq/__init__.py rename to examples/examples_prompt/backbones/vit.py diff --git a/examples/examples_prompt/collect_result.jsonl b/examples/examples_prompt/collect_result.jsonl deleted file mode 100644 index 990a2d9..0000000 --- a/examples/examples_prompt/collect_result.jsonl +++ /dev/null @@ -1,59 +0,0 @@ -# the final results will be populated here.{ - "evaluate": { - "epoch": 20.0, - "eval_accuracy": 89.2156862745098, - "eval_average_metrics": 90.76168929110105, - "eval_f1": 92.3076923076923, - "eval_loss": 0.16493959724903107, - "eval_runtime": 1.6391, - "eval_samples_per_second": 124.455 - }, - "repo_name": "DeltaHub/bitfit_t5-base_mrpc", - 
"test": { - "epoch": 20.0, - "test_accuracy": 88.23529411764706, - "test_average_metrics": 89.97971602434077, - "test_f1": 91.72413793103448, - "test_loss": 0.14968213438987732, - "test_runtime": 1.6344, - "test_samples_per_second": 124.82 - } -} -{ - "evaluate": { - "epoch": 20.0, - "eval_average_metrics": 52.10265668831534, - "eval_loss": 0.3603779077529907, - "eval_matthews_correlation": 52.10265668831534, - "eval_runtime": 1.0808, - "eval_samples_per_second": 482.046 - }, - "repo_name": "DeltaHub/bitfit_t5-base_cola", - "test": { - "epoch": 20.0, - "test_average_metrics": 54.209563471221934, - "test_loss": 0.2853100299835205, - "test_matthews_correlation": 54.209563471221934, - "test_runtime": 1.056, - "test_samples_per_second": 494.304 - } -} -{ - "evaluate": { - "epoch": 20.0, - "eval_average_metrics": 53.80613287067274, - "eval_loss": 0.25723716616630554, - "eval_matthews_correlation": 53.80613287067274, - "eval_runtime": 1.0583, - "eval_samples_per_second": 492.299 - }, - "repo_name": "DeltaHub/bitfit_t5-base_cola", - "test": { - "epoch": 20.0, - "test_average_metrics": 54.32497579543861, - "test_loss": 0.22327613830566406, - "test_matthews_correlation": 54.32497579543861, - "test_runtime": 1.0556, - "test_samples_per_second": 494.507 - } -} diff --git a/examples/examples_prompt/configs/adapter_clip-vit-base-patch32/beans.json b/examples/examples_prompt/configs/adapter_clip-vit-base-patch32/beans.json new file mode 100644 index 0000000..5f46495 --- /dev/null +++ b/examples/examples_prompt/configs/adapter_clip-vit-base-patch32/beans.json @@ -0,0 +1,48 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "beans", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32", + "num_classes": 3, + "num_train_epochs": 20, + "output_dir": "outputs/adapter/clip-vit-base-patch32/beans", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_delta_center": true, + "push_to_hub": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "beans", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "beans", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/adapter_opt-350m/wikitext.json b/examples/examples_prompt/configs/adapter_opt-350m/wikitext.json new file mode 100644 index 0000000..af141ff --- /dev/null +++ b/examples/examples_prompt/configs/adapter_opt-350m/wikitext.json @@ -0,0 +1,53 @@ +{ + "backbone_model": "opt", + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "wikitext", + "eval_steps": 200, + "evaluation_strategy": 
"steps", + "gradient_accumulation_steps":2, + "greater_is_better": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 900, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m", + "model_path_public": "opt-350m", + "num_train_epochs": 3, + "output_dir": "outputs/adapter/opt-350m/wikitext", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 6, + "per_device_train_batch_size": 6, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "wikitext", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "wikitext", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["self_attn"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/adapter_vit-large-patch16-224-in21k/beans.json b/examples/examples_prompt/configs/adapter_vit-large-patch16-224-in21k/beans.json new file mode 100644 index 0000000..ff7551a --- /dev/null +++ b/examples/examples_prompt/configs/adapter_vit-large-patch16-224-in21k/beans.json @@ -0,0 +1,53 @@ +{ + "backbone_model": "vit", + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": false, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "beans", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k", + "model_path_public": "vit-large-patch16-224-in21k", + "num_classes": 3, + "num_train_epochs": 20, + "output_dir": "outputs/adapter/vit-large-patch16-224-in21k/beans", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "beans", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "beans", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["output"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/bitfit_t5-large/rte.json b/examples/examples_prompt/configs/bitfit_t5-large/rte.json new file mode 100644 index 0000000..04e7f77 --- /dev/null +++ b/examples/examples_prompt/configs/bitfit_t5-large/rte.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "t5-large", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + 
"learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/t5-large", + "model_path_public": "t5-large", + "num_train_epochs": 20, + "output_dir": "outputs/bitfit/t5-large/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/t5-large", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attn", "ff", "layer_norm"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/compacter_blenderbot-3b/sst2.json b/examples/examples_prompt/configs/compacter_blenderbot-3b/sst2.json new file mode 100644 index 0000000..2862f6e --- /dev/null +++ b/examples/examples_prompt/configs/compacter_blenderbot-3b/sst2.json @@ -0,0 +1,66 @@ +{ + "backbone_model": "blenderbot", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "compacter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "sst2", + "eval_steps": 200, + "evaluation_strategy": "steps", + "factorized_phm": true, + "factorized_phm_rule": false, + "gradient_clip": false, + "greater_is_better": true, + "hypercomplex_adapters": true, + "hypercomplex_division": 4, + "hypercomplex_nonlinearity": "glorot-uniform", + "learn_phm": true, + "learning_rate": 0.003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b", + "model_path_public": "blenderbot-3b", + "non_linearity": "gelu_new", + "normalize_phm_weight": false, + "num_train_epochs": 3, + "output_dir": "outputs/compacter/blenderbot-3b/sst2", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "phm_c_init": "normal", + "phm_clamp": false, + "phm_init_range": 0.0001, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "shared_phm_rule": false, + "split_validation_test": true, + "task_name": "sst2", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "sst2", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "use_bias_down_sampler": true, + "use_bias_up_sampler": true, + "warmup_steps": 0, + "modified_modules":["fc2"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/compacter_deberta-v2-xlarge/mnli.json b/examples/examples_prompt/configs/compacter_deberta-v2-xlarge/mnli.json new file mode 100644 index 0000000..23c38d7 --- /dev/null +++ b/examples/examples_prompt/configs/compacter_deberta-v2-xlarge/mnli.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "deberta-v2-xlarge", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": 
"/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "compacter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mnli", + "eval_steps": 500, + "evaluation_strategy": "steps", + "greater_is_better": true, + "is_seq2seq": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge", + "num_train_epochs": 3, + "output_dir": "outputs/compacter/deberta-v2-xlarge/mnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hub": false, + "save_steps": 500, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "mnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mnli", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attention"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/compacter_long-t5-tglobal-large/rte.json b/examples/examples_prompt/configs/compacter_long-t5-tglobal-large/rte.json new file mode 100644 index 0000000..eb3d7c1 --- /dev/null +++ b/examples/examples_prompt/configs/compacter_long-t5-tglobal-large/rte.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "long-t5", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "compacter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large", + "model_path_public": "long-t5-tglobal-large", + "num_train_epochs": 20, + "output_dir": "outputs/compacter/long-t5-tglobal-large/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attn", "ff", "layer_norm"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/gen_bart.py b/examples/examples_prompt/configs/gen_bart.py index 0008afc..ec5a2f0 100644 --- a/examples/examples_prompt/configs/gen_bart.py +++ b/examples/examples_prompt/configs/gen_bart.py @@ -71,8 +71,21 @@ AllConfigs['adapter_bart-base'].update({ "output_dir": "outputs/adapter/bart-base/", }) -AllConfigs['lora_bart-base'] = copy.deepcopy(BaseConfigs['bart-base']) -AllConfigs['lora_bart-base'].update({ +AllConfigs['parallel_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) 
+AllConfigs['parallel_adapter_t5-base'].update({ + "delta_type": "parallel_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/parallel_adapter/t5-base/", + }) + +AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['lora_t5-base'].update({ "delta_type": "lora", "learning_rate": 3e-4, "unfrozen_modules": [ diff --git a/examples/examples_prompt/configs/gen_clip.py b/examples/examples_prompt/configs/gen_clip.py index e7cb94d..41a59c5 100644 --- a/examples/examples_prompt/configs/gen_clip.py +++ b/examples/examples_prompt/configs/gen_clip.py @@ -2,7 +2,7 @@ import collections import copy PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" -PATHBASE="/home/hushengding/plm_cache/" +# PATHBASE="/home/hushengding/plm_cache/" AllConfigs = {} diff --git a/examples/examples_prompt/configs/gen_t5.py b/examples/examples_prompt/configs/gen_t5.py index 8876197..7040fb6 100644 --- a/examples/examples_prompt/configs/gen_t5.py +++ b/examples/examples_prompt/configs/gen_t5.py @@ -45,11 +45,14 @@ BaseConfigs['t5-base'] = { "greater_is_better": True, "evaluation_strategy": "steps", "overwrite_output_dir": True, - "push_to_hub": False, - "push_to_delta_center": True, + "push_to_hf": False, + "push_to_dc": True, "save_strategy": "steps", "datasets_load_from_disk": True, - "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/" + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "backbone_model": "t5", # use in delta center, + "model_path_public": "t5-base", # use in delta center, + } AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) diff --git a/examples/examples_prompt/configs/lora_beit-large-patch16-224/cifar10.json b/examples/examples_prompt/configs/lora_beit-large-patch16-224/cifar10.json new file mode 100644 index 0000000..1a4d789 --- /dev/null +++ b/examples/examples_prompt/configs/lora_beit-large-patch16-224/cifar10.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "beit", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk", + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "cifar10", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224", + "model_path_public": "beit-large-patch16-224", + "num_classes": 10, + "num_train_epochs": 20, + "output_dir": "outputs/lora/beit-large-patch16-224/cifar10", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "cifar10", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "cifar10", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["query","value"] +} \ No newline at end of file diff --git 
a/examples/examples_prompt/configs/lora_gpt-j-6B/wikitext.json b/examples/examples_prompt/configs/lora_gpt-j-6B/wikitext.json new file mode 100644 index 0000000..11ebfde --- /dev/null +++ b/examples/examples_prompt/configs/lora_gpt-j-6B/wikitext.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "gpt-j", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "wikitext", + "eval_steps": 500, + "evaluation_strategy": "steps", + "gradient_accumulation_steps":4, + "greater_is_better": false, + "learning_rate": 0.00003, + "load_best_model_at_end": true, + "max_source_length": 512, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B", + "model_path_public": "gpt-j-6B", + "num_train_epochs": 2, + "output_dir": "outputs/lora/gpt-j-6B/wikitext", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 2, + "per_device_train_batch_size": 2, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 500, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "wikitext", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "wikitext", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["20.attn.q_proj","21.attn.q_proj","22.attn.q_proj","23.attn.q_proj","24.attn.q_proj","25.attn.q_proj","26.attn.q_proj","27.attn.q_proj"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/lora_roberta-large/superglue-boolq.json b/examples/examples_prompt/configs/lora_roberta-large/superglue-boolq.json new file mode 100644 index 0000000..9ef9cff --- /dev/null +++ b/examples/examples_prompt/configs/lora_roberta-large/superglue-boolq.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "roberta-large", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-boolq", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "is_seq2seq": false, + "learning_rate": 0.0001, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large", + "model_path_public": "roberta-large", + "num_train_epochs": 20, + "output_dir": "outputs/lora/roberta-large/superglue-boolq", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": false, + "push_to_hub": false, + "push_to_dc": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "superglue-boolq", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-boolq", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + 
"modified_modules":["query","value"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/lora_xlm-roberta-large/superglue-wic.json b/examples/examples_prompt/configs/lora_xlm-roberta-large/superglue-wic.json new file mode 100644 index 0000000..35a42f1 --- /dev/null +++ b/examples/examples_prompt/configs/lora_xlm-roberta-large/superglue-wic.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "xlm-roberta-large", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wic", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "is_seq2seq": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large", + "model_path_public": "xlm-roberta-large", + "num_train_epochs": 20, + "output_dir": "outputs/lora/xlm-roberta-large/superglue-wic", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hub": false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "superglue-wic", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wic", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["query","value"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/low_rank_adapter_gpt2/wikitext.json b/examples/examples_prompt/configs/low_rank_adapter_gpt2/wikitext.json new file mode 100644 index 0000000..3a60852 --- /dev/null +++ b/examples/examples_prompt/configs/low_rank_adapter_gpt2/wikitext.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "gpt2", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "low_rank_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "wikitext", + "eval_steps": 200, + "evaluation_strategy": "steps", + "gradient_accumulation_steps":1, + "greater_is_better": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 768, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt2", + "model_path_public": "gpt2", + "num_train_epochs": 2, + "output_dir": "outputs/low_rank_adapter/gpt2/wikitext", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "wikitext", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "wikitext", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt2", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + 
"modified_modules":["attn","mlp"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/prefix_bert-large-cased/rte.json b/examples/examples_prompt/configs/prefix_bert-large-cased/rte.json new file mode 100644 index 0000000..5d67563 --- /dev/null +++ b/examples/examples_prompt/configs/prefix_bert-large-cased/rte.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "bert-large-cased", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "is_seq2seq": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/bert-large-cased/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hub": false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attention"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/soft_prompt_bart-large/superglue-boolq.json b/examples/examples_prompt/configs/soft_prompt_bart-large/superglue-boolq.json new file mode 100644 index 0000000..19cbbba --- /dev/null +++ b/examples/examples_prompt/configs/soft_prompt_bart-large/superglue-boolq.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "bart", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "soft_prompt", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-boolq", + "eval_steps": 500, + "evaluation_strategy": "steps", + "gradient_accumulation_steps":1, + "greater_is_better": true, + "learning_rate": 0.1, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bart-large", + "model_path_public": "bart-large", + "num_train_epochs": 50, + "output_dir": "outputs/soft_prompt/bart-large/superglue-boolq", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 500, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "soft_token_num":100, + "split_validation_test": true, + "task_name": "superglue-boolq", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-boolq", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bart-large", + "token_init": true, + "unfrozen_modules": [ + "deltas" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git 
a/examples/examples_prompt/data_processors/processor.py b/examples/examples_prompt/data_processors/processor.py index 035bc5d..9986100 100644 --- a/examples/examples_prompt/data_processors/processor.py +++ b/examples/examples_prompt/data_processors/processor.py @@ -93,4 +93,10 @@ class AbstractTask(abc.ABC): # shuffles the data and samples it. if n_obs is not None: dataset = self.subsample(dataset, n_obs) - return dataset.map(self.preprocessor) + + this_method = getattr(self.__class__, 'preprocessor') + base_method = getattr(AbstractTask, 'preprocessor') + if this_method is not base_method: + return dataset.map(self.preprocessor) + else: + return dataset diff --git a/examples/examples_prompt/data_processors/tasks.py b/examples/examples_prompt/data_processors/tasks.py index aee5478..7d0402a 100644 --- a/examples/examples_prompt/data_processors/tasks.py +++ b/examples/examples_prompt/data_processors/tasks.py @@ -12,22 +12,16 @@ import logging import numpy as np import torch import re -from openprompt.prompts import ManualTemplate, ManualVerbalizer -from openprompt.plms.utils import TokenizerWrapper -from openprompt.data_utils import InputExample -from openprompt.prompts import GenerationVerbalizer import itertools - +import os logger = logging.getLogger(__name__) - from transformers.models.auto.tokenization_auto import tokenizer_class_from_name from typing import List, Dict from collections import defaultdict -from openprompt.utils import round_list import warnings @@ -68,7 +62,8 @@ class COLA(AbstractTask): } def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.cola")[split] else: return datasets.load_dataset('glue', 'cola', @@ -96,7 +91,8 @@ class SST2(AbstractTask): } def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.sst2")[split] else: return datasets.load_dataset('glue', 'sst2', @@ -123,10 +119,9 @@ class MRPC(AbstractTask): } - - def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.mrpc")[split] else: return datasets.load_dataset('glue', 'mrpc', split=split, script_version="master") @@ -152,7 +147,8 @@ class QQP(AbstractTask): def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.qqp")[split] else: return datasets.load_dataset('glue', 'qqp', @@ -208,7 +204,8 @@ class MNLI(AbstractTask): } def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.mnli")[split] else: return datasets.load_dataset('glue', 'mnli', split=split, script_version="master") @@ -243,7 +240,8 @@ class QNLI(AbstractTask): def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.qnli")[split] else: 
return datasets.load_dataset('glue', 'qnli', split=split, script_version="master") @@ -279,7 +277,8 @@ class RTE(AbstractTask): } def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.rte")[split] else: return datasets.load_dataset('glue', 'rte', @@ -306,7 +305,8 @@ class WNLI(AbstractTask): def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.wnli")[split] else: return datasets.load_dataset('glue', 'wnli', split=split, script_version="master") @@ -334,7 +334,8 @@ class SuperGLUEBoolQ(AbstractTask): } def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.boolq")[split] else: return datasets.load_dataset('super_glue', 'boolq', split=split, script_version="master") @@ -347,8 +348,8 @@ class SuperGLUECB(AbstractTask): split_to_data_split = {"train": "train", "validation": "validation", "test": "validation"} - metric = [metrics.mean_multiclass_f1(num_classes=3), metrics.accuracy] - metric_names = ["f1_multiclass", "accuracy"] + metric = [metrics.accuracy] + metric_names = ["accuracy"] verbalizers = { "0":{"0": "yes", @@ -361,7 +362,8 @@ class SuperGLUECB(AbstractTask): } def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.cb")[split] else: return datasets.load_dataset('super_glue', 'cb', split=split, script_version="master") @@ -387,7 +389,8 @@ class SuperGLUECOPA(AbstractTask): } def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.copa")[split] else: return datasets.load_dataset('super_glue', 'copa', split=split, script_version="master") @@ -416,7 +419,8 @@ class SuperGLUEMultiRC(AbstractTask): def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.multirc")[split] else: return datasets.load_dataset('super_glue', 'multirc', split=split, script_version="master") @@ -459,7 +463,8 @@ class SuperGLUEWIC(AbstractTask): } def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.wic")[split] else: return datasets.load_dataset('super_glue', 'wic', split=split, script_version="master") @@ -549,13 +554,76 @@ class Beans(AbstractTask): def load_dataset(self, split): # from IPython import embed; embed(header="beans") - if self.data_args.datasets_load_from_disk: - return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.wic")[split] + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': + return 
datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/beans")[split] else: return datasets.load_dataset('beans', split=split, script_version="master") +class Wikitext(AbstractTask): + #wikitext-2-v1 + name = "wikitext" + # labels_list = ['angular_leaf_spot', 'bean_rust', "healthy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.perplexity] + metric_names = ["perplexity"] + verbalizers = { + "0": { + } + } + templates_text = { + "0": """{"meta":"text"}""" + } + split_valid_to_make_test = True + def load_dataset(self, split): + # from IPython import embed; embed(header="beans") + if self.data_args.datasets_load_from_disk: + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/wikitext")[split] + else: + return datasets.load_dataset('wikitext','wikitext-2-v1', split=split, script_version="master") + +class Cifar10(AbstractTask): + name = "cifar10" + + split_to_data_split = {"train": "train", + "validation": "test", + "test": "test"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + def load_dataset(self, split): + if self.data_args.datasets_load_from_disk: + d = datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/cifar10")[split].select(range(100)) + print(d) + return d + else: + return datasets.load_dataset('cifar10', split=split, script_version="master") + # def preprocessor(self, example): + # example_ = {} + # example_["image"] = example["image"] + # example_["labels"] = example["label"] + + # return example_ +class Fashion_MNIST(AbstractTask): + name = "Fashion-MNIST" + + split_to_data_split = {"train": "train", + "validation": "test", + "test": "test"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + def load_dataset(self, split): + if self.data_args.datasets_load_from_disk: + d = datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/fashion_mnist")[split] + print(d) + return d + else: + return datasets.load_dataset('fashion_mnist', split=split, script_version="master") TASK_MAPPING = OrderedDict( [ @@ -575,7 +643,10 @@ TASK_MAPPING = OrderedDict( ('superglue-multirc', SuperGLUEMultiRC), ('superglue-wic', SuperGLUEWIC), # ('superglue-record', SuperGLUERecord) - ('beans', Beans) + ('beans', Beans), + ('wikitext',Wikitext), + ('cifar10',Cifar10), + ('fashion_mnist',Fashion_MNIST) ] ) diff --git a/examples/examples_prompt/metrics/metrics.py b/examples/examples_prompt/metrics/metrics.py index b9c7cb0..94267b0 100644 --- a/examples/examples_prompt/metrics/metrics.py +++ b/examples/examples_prompt/metrics/metrics.py @@ -11,6 +11,14 @@ import sklearn.metrics logger = getLogger(__name__) +def perplexity(outputs, targets,ignore_index=-100): + """Computes the perplexity accuracy.""" + + ce = -np.log(outputs).mean() + # ce = F.cross_entropy(torch.Tensor(outputs).view(-1, outputs.shape[-1]), torch.Tensor(targets).view(-1).long(),ignore_index=ignore_index) + + return {"perplexity":float(np.exp(ce))} + def accuracy(predictions, targets) -> dict: """Computes the average accuracy.""" return {"accuracy": 100 * ((np.array(predictions) == np.array(targets)).mean())} @@ -47,20 +55,20 @@ def spearman_corrcoef(predictions, targets) -> dict: -def spearman_corrcoef(predictions, targets) -> dict: - """Computes Spearman correlation coefficient.""" - # TODO: we need to do postprocessors in a clean way for each dataset. 
- from examples_seq2seq.data_processors.postprocessors import string_to_float - targets = [string_to_float(target) for target in targets] - predictions= [string_to_float(prediction) for prediction in predictions] - spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0] +# def spearman_corrcoef(predictions, targets) -> dict: +# """Computes Spearman correlation coefficient.""" +# # TODO: we need to do postprocessors in a clean way for each dataset. +# from examples_seq2seq.data_processors.postprocessors import string_to_float +# targets = [string_to_float(target) for target in targets] +# predictions= [string_to_float(prediction) for prediction in predictions] +# spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0] - # Note that if all the predictions will be the same, spearman - # correlation is nan, to gaurad against this, we check the output - # and return 0 in this case. - if math.isnan(spearman_corrcoef): - spearman_corrcoef = 0 - return {"spearmanr": spearman_corrcoef} +# # Note that if all the predictions will be the same, spearman +# # correlation is nan, to gaurad against this, we check the output +# # and return 0 in this case. +# if math.isnan(spearman_corrcoef): +# spearman_corrcoef = 0 +# return {"spearmanr": spearman_corrcoef} def f1_score_with_invalid(predictions, targets) -> dict: @@ -102,8 +110,8 @@ def f1_score(predictions, targets) -> dict: Returns: F1 score, where any prediction != 0 or 1 is counted as wrong. """ - targets = targets.astype(np.int32) - predictions = predictions.astype(np.int32) + targets = np.array(targets).astype(np.int32) + predictions = np.array(predictions).astype(np.int32) return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)} # TODO: maybe gaurd against invalid values https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow diff --git a/examples/examples_prompt/src/run.py b/examples/examples_prompt/src/run.py index 5d29700..aca5210 100644 --- a/examples/examples_prompt/src/run.py +++ b/examples/examples_prompt/src/run.py @@ -26,10 +26,12 @@ You can also adapt this script on your own tasks. import os import sys + os.environ['MKL_THREADING_LAYER'] = 'GNU' os.environ['MKL_SERVICE_FORCE_INTEL'] = '1' os.environ["TOKENIZERS_PARALLELISM"] = "false" sys.path.append(os.path.join(os.getcwd(), "../")) +# sys.path.append(os.path.join(os.getcwd(), "/mnt/sfs_turbo/zhangzhen/OpenDelta")) sys.path.append(os.path.join(os.getcwd())) import functools @@ -56,7 +58,7 @@ from transformers.trainer_utils import is_main_process, get_last_checkpoint from data_processors import AutoTask #, #TaskDataCollatorForSeq2Seq, AutoPostProcessor, data_collator from utils import read_json, save_json -from utils.args import ModelArguments, TrainingArguments, DataTrainingArguments, RemainArgHfArgumentParser +from utils.args import ModelArguments, TrainingArguments, DataTrainingArguments, DeltaArguments, RemainArgHfArgumentParser logger = logging.getLogger(__name__) @@ -66,16 +68,14 @@ def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. 
- model_args, data_args, training_args, delta_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args, delta_args = parser.parse_args_into_dataclasses(return_remaining_strings=True) + parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, DeltaArguments)) + # You can provide a json file with contains the arguments and use the --argument some_arg to override or append to the json file. + json_file, cmd_args = (os.path.abspath(sys.argv[1]), sys.argv[2:]) if sys.argv[1].endswith(".json") else (None, sys.argv[1:]) + model_args, data_args, training_args, delta_args, remain_args = parser.parse_json_file_with_cmd_args(json_file=json_file, command_line_args=cmd_args) + logger.warning("The following arguments not used! {}".format(remain_args)) - print(f"{training_args.output_dir}/results.json") + logger.info(f"The results will be used in {training_args.output_dir}/results.json") # exit() # Detecting last checkpoint. last_checkpoint = None @@ -121,7 +121,8 @@ def main(): - if os.path.basename(model_args.model_name_or_path).startswith("t5"): + if os.path.basename(model_args.model_name_or_path).startswith("t5") \ + or os.path.basename(model_args.model_name_or_path).startswith("long-t5") : from examples_prompt.backbones.t5 import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts from examples_prompt.backbones.t5 import Trainer, DataCollator elif os.path.basename(model_args.model_name_or_path).startswith("blenderbot"): @@ -129,7 +130,9 @@ def main(): from examples_prompt.backbones.blenderbot import Trainer, DataCollator elif os.path.basename(model_args.model_name_or_path).startswith("roberta") \ or os.path.basename(model_args.model_name_or_path).startswith("bert") \ - or os.path.basename(model_args.model_name_or_path).startswith("albert") : + or os.path.basename(model_args.model_name_or_path).startswith("albert") \ + or os.path.basename(model_args.model_name_or_path).startswith("xlm-roberta") \ + or os.path.basename(model_args.model_name_or_path).startswith("deberta") : from examples_prompt.backbones.bert import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts from examples_prompt.backbones.bert import Trainer, DataCollator elif os.path.basename(model_args.model_name_or_path).startswith("beit"): @@ -144,6 +147,10 @@ def main(): elif os.path.basename(model_args.model_name_or_path).startswith("clip"): from examples_prompt.backbones.clip import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts from examples_prompt.backbones.clip import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("opt") \ + or os.path.basename(model_args.model_name_or_path).startswith("gpt"): + from examples_prompt.backbones.opt import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.opt import Trainer, DataCollator @@ -161,7 +168,8 @@ def main(): if delta_args.delta_type.lower() != "none": from opendelta import AutoDeltaConfig,AutoDeltaModel - delta_config = AutoDeltaConfig.from_dict(vars(delta_args)) + from dataclasses import asdict + delta_config = AutoDeltaConfig.from_dict(asdict(delta_args)) delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model) delta_model.freeze_module(set_state_dict = True) delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True) @@ -278,14 +286,9 @@ def main(): if 
torch.cuda.is_available() and training_args.compute_memory: peak_memory = (torch.cuda.max_memory_allocated() / 1024 ** 2)/1000 - print( - "Memory utilization", - peak_memory, - "GB" - ) performance_metrics.update({"peak_memory": peak_memory}) if training_args.compute_memory or training_args.compute_time: - print("Efficiency Statistics {}".format(performance_metrics)) + logger.info("Efficiency Statistics {}".format(performance_metrics)) trainer.save_metrics("performance", performance_metrics) # Evaluation @@ -313,17 +316,30 @@ def main(): trainer.save_metrics(f"{data_args.task_name}_test", metrics) all_results['test'][data_args.task_name] = metrics + # from opendelta.utils.delta_hub import create_hub_repo_name + # from opendelta.utils.delta_center import create_delta_center_args, create_repo_name + # repo_name = create_hub_repo_name(root="DeltaHub", # dataset=data_args.task_name, # delta_type = delta_args.delta_type, # model_name_or_path= model_args.model_name_or_path) - # results['repo_name'] = repo_name - # if delta_args.delta_type.lower() != "none": - # if training_args.push_to_hub: # TODO add description here - # delta_model.save_finetuned(push_to_hub=True, save_directory=repo_name, use_auth_token=True) - # # trainer.push_to_hub(**kwargs) - # else: - # delta_model.save_finetuned(push_to_hub=False, save_directory=repo_name, use_auth_token=True) + + # center_args = + # repo_name = create_repo_name(prefix="", center_args=center_args) + # all_results['repo_name'] = repo_name + + + delta_model.save_finetuned(finetuned_delta_path=delta_args.finetuned_delta_path, + push_to_dc=training_args.push_to_dc, + center_args={"test_performance":all_results['test'][data_args.task_name]['test_average_metrics'], + }, + center_args_pool = {**vars(model_args), **vars(data_args), **vars(training_args), **vars(delta_args)}, + list_tags = ['NLI'], + dict_tags = {'purpose':'for testing'}, + delay_push=True, + test_result=all_results['test'] + ) + with open(f"{training_args.output_dir}/results.json", 'w') as fout: diff --git a/examples/examples_prompt/src/test.py b/examples/examples_prompt/src/test.py new file mode 100644 index 0000000..fb17494 --- /dev/null +++ b/examples/examples_prompt/src/test.py @@ -0,0 +1,344 @@ +# coding=utf-8 +# Copyright OpenDelta Team and THUNLP lab. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +A unified runing scripts for most models to do down stream tasks in a +prompt learning fashion, i.e., No classification head, all tasks are casted +to mask prediction or span prediction tasks. + +Processing relevant to different backbone models are stored in ../backbones/ + +Adding A few lines to integrate the Delta tuning methods. + +You can also adapt this script on your own tasks. 
+""" + +import os +import sys +os.environ['MKL_THREADING_LAYER'] = 'GNU' +os.environ['MKL_SERVICE_FORCE_INTEL'] = '1' +os.environ["TOKENIZERS_PARALLELISM"] = "false" +sys.path.append(os.path.join(os.getcwd(), "../")) +sys.path.append(os.path.join(os.getcwd())) + +import functools +import logging +import torch +import json +import numpy as np + +import transformers +from transformers import ( + AutoConfig, + AutoModelForMaskedLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + DataCollatorForSeq2Seq, + # HfArgumentParser, + # MBartTokenizer, + # default_data_collator, + Trainer, + Seq2SeqTrainer, + set_seed, +) +from transformers.trainer_utils import is_main_process, get_last_checkpoint + +from data_processors import AutoTask #, #TaskDataCollatorForSeq2Seq, AutoPostProcessor, data_collator +from utils import read_json, save_json +from utils.args import ModelArguments, TrainingArguments, DataTrainingArguments, RemainArgHfArgumentParser, DeltaArguments + + +logger = logging.getLogger(__name__) + + +def main(): + parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, DeltaArguments)) + + # You can provide a json file with contains the arguments and use the --argument some_arg to override or append to the json file. + json_file, cmd_args = (os.path.abspath(sys.argv[1]), sys.argv[2:]) if sys.argv[1].endswith(".json") else (None, sys.argv[1:]) + model_args, data_args, training_args, delta_args, remain_args = parser.parse_json_file_with_cmd_args(json_file=json_file, command_line_args=cmd_args) + logger.warning("The following arguments not used! {}".format(remain_args)) + + # # exit() + # # Detecting last checkpoint. + # last_checkpoint = None + # if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + # last_checkpoint = get_last_checkpoint(training_args.output_dir) + # print("#### last_checkpoint ", last_checkpoint) + # if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + # ''' + # raise ValueError( + # f"Output directory ({training_args.output_dir}) already exists and is not empty. " + # "Use --overwrite_output_dir to overcome." + # ) + # ''' + # pass + # elif last_checkpoint is not None: + # logger.info( + # f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + # "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + # ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + # Set the verbosity to info of the Transformers logger (on main process only): + if is_main_process(training_args.local_rank): + transformers.utils.logging.set_verbosity_info() + # logger.info("Training/evaluation parameters %s", training_args, model_args, data_args, delta_args) + logger.info("{}\n{}\n{}\n{}".format(training_args, model_args, data_args, delta_args)) + + + # Set seed before initializing model. 
+ set_seed(training_args.seed) + + + + if os.path.basename(model_args.model_name_or_path).startswith("t5"): + from examples_prompt.backbones.t5 import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.t5 import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("blenderbot"): + from examples_prompt.backbones.blenderbot import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.blenderbot import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("roberta") \ + or os.path.basename(model_args.model_name_or_path).startswith("bert") \ + or os.path.basename(model_args.model_name_or_path).startswith("albert") : + from examples_prompt.backbones.bert import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.bert import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("beit"): + from examples_prompt.backbones.beit import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.beit import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("bart"): + from examples_prompt.backbones.bart import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.bart import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("bigbird"): + from examples_prompt.backbones.bigbird import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.bigbird import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("clip"): + from examples_prompt.backbones.clip import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.clip import Trainer, DataCollator + + + + config, tokenizer, model = get_backbone(model_args=model_args) + + # model parallelize + if hasattr(training_args, "model_parallel") and training_args.model_parallel: + logger.info('parallelize model!') + model.parallelize() + + from opendelta import Visualization + Visualization(model).structure_graph() + + if delta_args.delta_type.lower() != "none": + from opendelta.delta_models.adapter import AdapterConfig, AdapterModel + delta_config = AdapterConfig.from_finetuned(finetuned_delta_path=delta_args.finetuned_delta_path) + delta_model = AdapterModel.from_finetuned(finetuned_delta_path=delta_args.finetuned_delta_path, + delta_config=delta_config, + backbone_model=model, + force_download=delta_args.force_download, + cache_dir=delta_args.delta_cache_dir) + # delta_model.freeze_module(set_state_dict = True) + delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True) + + + performance_metrics = {} + + + + + non_empty_splits_names = [] + # if training_args.do_train: + # non_empty_splits_names.append("train") + # if training_args.do_eval: + # non_empty_splits_names.append("eval") + if training_args.do_test: + non_empty_splits_names.append("test") + splits = {} + for split_name in ['test']: + if split_name not in non_empty_splits_names: + splits[split_name] = None + continue + + task = AutoTask.get(data_args.task_name, + data_args.dataset_config_name, + data_args=data_args, + 
seed=data_args.data_sample_seed) + + dataset = task.get(split=split_name, + split_validation_test=training_args.split_validation_test, + n_obs=data_args.max_train_samples) + + + + template, _verbalizer, tokenizer_wrapper = get_prompts(task, tokenizer, data_args) + + + dataset = dataset.map( + functools.partial(preprocess_function, + data_args=data_args, + tokenizer=tokenizer, + template=template, + verbalizer=_verbalizer, + tokenizer_wrapper=tokenizer_wrapper, + split=split_name), + batched=False, + num_proc=data_args.preprocessing_num_workers, + remove_columns=get_remove_columns(list(dataset.features.keys())), + load_from_cache_file=not data_args.overwrite_cache, + ) + # from IPython import embed; embed() + splits[split_name] = dataset + if split_name == "test": + eval_task = task + verbalizer = _verbalizer + + + + trainer = Trainer( + model=model, + verbalizer=verbalizer, + eval_task=eval_task, + args=training_args, + # train_dataset=splits['train'], + # eval_dataset=splits['eval'], + tokenizer=tokenizer, + data_collator=DataCollator(tokenizer), + ) + + + def save_training_config(config_file, output_dir): + json_data = read_json(config_file) + save_json(os.path.join(output_dir, "training_config.json"), json_data) + + + # Saves training config. + if trainer.is_world_process_zero(): + save_training_config(sys.argv[1], training_args.output_dir) + + # # Training + # if training_args.do_train: + # checkpoint = None + # if training_args.resume_from_checkpoint is not None: + # checkpoint = training_args.resume_from_checkpoint + # elif last_checkpoint is not None: + # checkpoint = last_checkpoint + + # if training_args.compute_time: + # torch.cuda.synchronize() # wait for move to complete + # start = torch.cuda.Event(enable_timing=True) + # end = torch.cuda.Event(enable_timing=True) + # start.record() + + # train_result = trainer.train(resume_from_checkpoint=checkpoint) + + # if training_args.compute_time: + # end.record() + # torch.cuda.synchronize() # wait for all_reduce to complete + # total_time = start.elapsed_time(end)/(1000*60) + # performance_metrics.update({"total_time in minutes ": total_time}) + + # trainer.save_model() # Saves the tokenizer too for easy upload + # train_metrics = train_result.metrics + # max_train_samples = ( + # data_args.max_train_samples if data_args.max_train_samples is not None else len(splits['train']) + # ) + # train_metrics["train_samples"] = min(max_train_samples, len(splits['train'])) + # trainer.log_metrics("train", train_metrics) + # trainer.save_metrics("train", train_metrics) + # trainer.save_state() + + # if torch.cuda.is_available() and training_args.compute_memory: + # peak_memory = (torch.cuda.max_memory_allocated() / 1024 ** 2)/1000 + # print( + # "Memory utilization", + # peak_memory, + # "GB" + # ) + # performance_metrics.update({"peak_memory": peak_memory}) + # if training_args.compute_memory or training_args.compute_time: + # print("Efficiency Statistics {}".format(performance_metrics)) + # trainer.save_metrics("performance", performance_metrics) + + # Evaluation + all_results = {} + + # all_results['evaluate'] = {} + + # if training_args.do_eval: + # logger.info("*** Evaluate ***") + + # metrics = trainer.evaluate(eval_dataset=splits['eval'], + # ) + # trainer.log_metrics(f"{data_args.task_name}_eval", metrics) + # trainer.save_metrics(f"{data_args.task_name}_eval", metrics) + # all_results['evaluate'][data_args.task_name] = metrics + + # Test + all_results['test'] = {} + if training_args.do_test: + logger.info("*** Test ***") + metrics = 
trainer.evaluate(eval_dataset=splits['test'], + metric_key_prefix="test" + ) + trainer.log_metrics(f"{data_args.task_name}_test", metrics) + trainer.save_metrics(f"{data_args.task_name}_test", metrics) + all_results['test'][data_args.task_name] = metrics + + # from opendelta.utils.delta_hub import create_hub_repo_name + # from opendelta.utils.delta_center import create_delta_center_args, create_repo_name + + # repo_name = create_hub_repo_name(root="DeltaHub", + # dataset=data_args.task_name, + # delta_type = delta_args.delta_type, + # model_name_or_path= model_args.model_name_or_path) + + # center_args = + # repo_name = create_repo_name(prefix="", center_args=center_args) + # all_results['repo_name'] = repo_name + + + # delta_model.save_finetuned(push_to_hf=training_args.push_to_hf, + # push_to_dc=training_args.push_to_dc, + # center_args={}, + # center_args_pool = {**vars(model_args), **vars(data_args), **vars(training_args), **vars(delta_args)}, + # delay_push=True, + # ) + + print(all_results) + + + + # with open(f"{training_args.output_dir}/results.json", 'w') as fout: + # string = json.dumps(all_results, indent=4,sort_keys=True) + # fout.write(string+"\n") + + return all_results + + + + +if __name__ == "__main__": + result = main() + diff --git a/examples/examples_prompt/utils/args.py b/examples/examples_prompt/utils/args.py index aefec9a..23bdab8 100644 --- a/examples/examples_prompt/utils/args.py +++ b/examples/examples_prompt/utils/args.py @@ -1,6 +1,10 @@ from dataclasses import dataclass, field from typing import Optional, List from transformers import HfArgumentParser +from pathlib import Path +import sys + + @dataclass class ModelArguments: @@ -81,6 +85,10 @@ class TrainingArguments(HfTrainingArguments): remove_unused_columns: Optional[bool] = field( default=False, metadata={"help": "Remove columns not required by the model when using an nlp.Dataset."} ) + push_to_hf: Optional[bool] = field(default=False, metadata={"help": "Push the model to huggingface model hub."}) + push_to_dc: Optional[bool] = field(default=True, metadata={"help": "Push the model to delta center."}) + + @@ -211,28 +219,254 @@ class DataTrainingArguments: self.test_max_target_length = self.max_target_length + +import dataclasses + +@dataclass +class DeltaArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. 
+ """ + delta_type: str= field(default="", metadata={"help": "the type of delta"}) + backbone_model: Optional[str] = field( + default="", metadata={"help": "the backbone model"} + ) + model_path_public: Optional[str] = field( + default="", metadata={"help": "the path (url) of the publicly available backbone model"} + ) + modified_modules: Optional[List[str]] = field( + default_factory=lambda: None, metadata={"help": "the modules inside the backbone to be modified"} + ) + unfrozen_modules: Optional[List[str]] = field( + default_factory=lambda:["deltas"], metadata={"help": "the modules inside the backbone or in the delta modules that need to be unfrozen"} + ) + finetuned_delta_path: Optional[str] = field( + default=None, metadata={"help": "the path of the finetuned delta model"} + ) + force_download: Optional[bool] = field( + default=False, metadata={"help": "whether to download the checkpoint form delta center no matter whether it exists"} + ) + local_files_only: Optional[bool] = field( + default=False, metadata={"help": "whether not to look for file in delta center"} + ) + delta_cache_dir: Optional[str] = field( + default=None, metadata={"help": "The cache path defined by user. If not set, we will firstly look into the"+ + " working directory and then into the default cache path (ususally ~/.cache/delta_center)."} + ) + delay_push: Optional[bool] = field( + default=True, metadata={ + 'help':'whether push the checkpoint to delta center later.' + } + ) + + def merge_arguments(self, objb): + print(objb) + self.__class__ = dataclasses.make_dataclass('DeltaArgument', fields=[(s.name, s.type, getattr(objb, s.name)) for s in dataclasses.fields(objb)], bases=(DeltaArguments,)) + + + + +@dataclass +class AdapterArguments: + bottleneck_dim: Optional[int] = field( + default=24, metadata={"help": "the dimension of the bottleneck layer"} + ) +@dataclass +class LoRAArguments: + lora_r: Optional[int] = field( + default=8, metadata={"help": "the rank of the LoRA metrics."} + ) +@dataclass +class PrefixArguments: + pass +@dataclass +class BitFitArguments: + pass +@dataclass +class SoftPromptArguments: + soft_token_num: Optional[int] = field( + default=100, metadata={"help": "the num of soft tokens."} + ) + +@dataclass +class CompacterArguments: + pass +@dataclass +class LowRankAdapterArguments: + pass + +# from opendelta.delta_models.adapter import AdapterConfig +# from opendelta.delta_models.bitfit import BitFitConfig +# from opendelta.delta_models.compacter import CompacterConfig +# from opendelta.delta_models.lora import LoraArguments +# from opendelta.delta_models.low_rank_adapter import LowRankAdapterConfig +# from opendelta.delta_models.prefix import PrefixConfig +# from opendelta.delta_models.soft_prompt import SoftPromptConfig +# DELTAARGMAP = { +# "adapter": AdapterConfig, +# "lora":LoraArguments, +# "prefix":PrefixConfig, +# "bitfit":BitFitConfig, +# "soft_prompt":SoftPromptConfig, +# "compacter":CompacterConfig, +# "low_rank_adapter":LowRankAdapterConfig + +# } + +DELTAARGMAP = { + "adapter": AdapterArguments, + "lora":LoRAArguments, + "prefix":PrefixArguments, + "bitfit":BitFitArguments, + "soft_prompt":SoftPromptArguments, + "compacter":CompacterArguments, + "low_rank_adapter":LowRankAdapterArguments + +} + +# TODO: add more specific delta arguments + + + class RemainArgHfArgumentParser(HfArgumentParser): - def parse_json_file(self, json_file: str, return_remaining_args=True ): + '''This is a more powerful version of argument parser. 
+ It can receiven both command line arguments and json file arguments. + The command line arguments will override the json file arguments. + The parser will load the specific delta arguments (e.g. Adapter's) + according to the delta_type argument. And merge the specific delta arguments + with the common delta arguments. + ''' + def parse_json_file_with_cmd_args(self, json_file: str, command_line_args=None, return_remaining_args=True ): """ Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the dataclass types. """ - import argparse + import json from pathlib import Path - import dataclasses + + data = json.loads(Path(json_file).read_text()) + + + data_str = "" + if command_line_args is None: + command_line_args = [] + for key in data: + if "--"+key not in command_line_args: + if isinstance(data[key], list): + data_str += "--"+key + for elem in data[key]: + data_str+=" "+ str(elem) + data_str += " " + else: + data_str+= "--" + key + " " + str(data[key]) + " " + + data_list = data_str.split() + data_list += command_line_args + + + if return_remaining_args: + outputs, remain_args = self.parse_args_into_dataclasses(args=data_list, return_remaining_strings=return_remaining_args) + for d in outputs: + if isinstance(d, DeltaArguments): # merge the specific delta arguments + d.merge_arguments(outputs[-1]) + + return [*(outputs[:-1]), remain_args] + else: + outputs = self.parse_args_into_dataclasses(args=data_list, return_remaining_strings=return_remaining_args) + for d in outputs: + if isinstance(d, DeltaArguments): + d.merge_arguments(outputs[-1]) + return [*(outputs[:-1]),] + + def parse_args_into_dataclasses( + self, args=None, return_remaining_strings=False, look_for_args_file=True, args_filename=None + ): + """ + Parse command-line args into instances of the specified dataclass types. + + This relies on argparse's `ArgumentParser.parse_known_args`. See the doc at: + docs.python.org/3.7/library/argparse.html#argparse.ArgumentParser.parse_args + + Args: + args: + List of strings to parse. The default is taken from sys.argv. (same as argparse.ArgumentParser) + return_remaining_strings: + If true, also return a list of remaining argument strings. + look_for_args_file: + If true, will look for a ".args" file with the same base name as the entry point script for this + process, and will append its potential content to the command line args. + args_filename: + If not None, will uses this file instead of the ".args" file specified in the previous argument. + + Returns: + Tuple consisting of: + + - the dataclass instances in the same order as they were passed to the initializer.abspath + - if applicable, an additional namespace for more (non-dataclass backed) arguments added to the parser + after initialization. + - The potential list of remaining argument strings. (same as argparse.ArgumentParser.parse_known_args) + """ + if args_filename or (look_for_args_file and len(sys.argv)): + if args_filename: + args_file = Path(args_filename) + else: + args_file = Path(sys.argv[0]).with_suffix(".args") + + if args_file.exists(): + fargs = args_file.read_text().split() + args = fargs + args if args is not None else fargs + sys.argv[1:] + # in case of duplicate arguments the first one has precedence + # so we append rather than prepend. 
+ namespace, remaining_args = self.parse_known_args(args=args) + + # conditionally add delta arguments + deltatype_args = DELTAARGMAP[namespace.delta_type] + self.dataclass_types.append(deltatype_args) + self._add_dataclass_arguments(deltatype_args) + + # parse the arguments again, this time with the specific delta type's arguments + namespace, remaining_args = self.parse_known_args(args=args) + + outputs = [] for dtype in self.dataclass_types: keys = {f.name for f in dataclasses.fields(dtype) if f.init} - inputs = {k: data.pop(k) for k in list(data.keys()) if k in keys} + inputs = {k: v for k, v in vars(namespace).items() if k in keys} + for k in keys: + delattr(namespace, k) obj = dtype(**inputs) outputs.append(obj) - - remain_args = argparse.ArgumentParser() - remain_args.__dict__.update(data) - if return_remaining_args: - return (*outputs, remain_args) + if len(namespace.__dict__) > 0: + # additional namespace. + outputs.append(namespace) + if return_remaining_strings: + return (outputs, remaining_args) else: - return (*outputs,) \ No newline at end of file + if remaining_args: + raise ValueError(f"Some specified arguments are not used by the HfArgumentParser: {remaining_args}") + + return outputs + + # namespace, remaining_args = self.parse_known_args(args=data_list) + + # print("Here", command_line_args, data_list,namespace, remaining_args) + # data.update(remain_args) + + # outputs = [] + # for dtype in self.dataclass_types: + # keys = {f.name for f in dataclasses.fields(dtype) if f.init} + # inputs = {k: namespace.get(k) for k in list(data.keys()) if k in keys} + # obj = dtype(**inputs) + # outputs.append(obj) + + # # remain_args = argparse.ArgumentParser() + # remain_args.__dict__.update(remain_args) + # if return_remaining_args: + # return (*outputs, remain_args) + # else: + # return (*outputs,) + + diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/cola.json b/examples/examples_text-classification/configs/adapter_roberta-base/cola.json new file mode 100644 index 0000000..f2b7146 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/cola.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "cola", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/adapter/roberta-base/cola", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "cola", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "cola", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/mnli.json b/examples/examples_text-classification/configs/adapter_roberta-base/mnli.json new file mode 100644 index 0000000..91ecb3e --- /dev/null +++ 
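(Editor's note, not part of the patch.) The intended use of the parser above is easiest to see with a concrete call. The following is an illustrative sketch only: it assumes `RemainArgHfArgumentParser` and `DeltaArguments` are importable from the module modified above, and it uses `TrainingArguments` as a stand-in for whatever dataclasses a training script actually registers.

```python
# Sketch only (not part of the patch). Assumes RemainArgHfArgumentParser and
# DeltaArguments come from the module modified above; TrainingArguments is a
# stand-in for the dataclasses a real training script would register.
from transformers import TrainingArguments

parser = RemainArgHfArgumentParser((TrainingArguments, DeltaArguments))

# JSON keys are rewritten as "--key value" tokens, but any key that already
# appears in command_line_args is skipped, so explicit CLI flags win.
# Because the config sets "delta_type": "adapter", AdapterArguments is looked
# up in DELTAARGMAP, parsed, and merged into the DeltaArguments instance.
parsed = parser.parse_json_file_with_cmd_args(
    "configs/adapter_roberta-base/cola.json",       # config added in this diff
    command_line_args=["--learning_rate", "1e-4"],  # overrides the JSON value
)
# `parsed` holds the populated dataclasses followed by the list of argument
# strings that none of the registered dataclasses consumed.
```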
b/examples/examples_text-classification/configs/adapter_roberta-base/mnli.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mnli", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/adapter/roberta-base/mnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "mnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mnli", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/mrpc.json b/examples/examples_text-classification/configs/adapter_roberta-base/mrpc.json new file mode 100644 index 0000000..df7a01e --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/mrpc.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mrpc", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/adapter/roberta-base/mrpc", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "mrpc", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mrpc", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/qnli.json b/examples/examples_text-classification/configs/adapter_roberta-base/qnli.json new file mode 100644 index 0000000..5292173 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/qnli.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qnli", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/adapter/roberta-base/qnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 
32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "qnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qnli", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/qqp.json b/examples/examples_text-classification/configs/adapter_roberta-base/qqp.json new file mode 100644 index 0000000..471844c --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/qqp.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qqp", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/adapter/roberta-base/qqp", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "qqp", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qqp", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/rte.json b/examples/examples_text-classification/configs/adapter_roberta-base/rte.json new file mode 100644 index 0000000..01bef33 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/rte.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/adapter/roberta-base/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/sst2.json b/examples/examples_text-classification/configs/adapter_roberta-base/sst2.json new file mode 100644 index 0000000..8638837 --- /dev/null +++ 
b/examples/examples_text-classification/configs/adapter_roberta-base/sst2.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "sst2", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/adapter/roberta-base/sst2", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "sst2", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "sst2", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/stsb.json b/examples/examples_text-classification/configs/adapter_roberta-base/stsb.json new file mode 100644 index 0000000..751ccc1 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/stsb.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "stsb", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/adapter/roberta-base/stsb", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "stsb", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "stsb", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/superglue-boolq.json b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-boolq.json new file mode 100644 index 0000000..37fcc44 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-boolq.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-boolq", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/adapter/roberta-base/superglue-boolq", + 
"overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-boolq", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-boolq", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/superglue-cb.json b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-cb.json new file mode 100644 index 0000000..5a7c2f8 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-cb.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-cb", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/adapter/roberta-base/superglue-cb", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-cb", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-cb", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/superglue-copa.json b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-copa.json new file mode 100644 index 0000000..c7af0f7 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-copa.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-copa", + "eval_steps": 50, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 40, + "output_dir": "outputs/adapter/roberta-base/superglue-copa", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 50, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-copa", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-copa", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git 
a/examples/examples_text-classification/configs/adapter_roberta-base/superglue-multirc.json b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-multirc.json new file mode 100644 index 0000000..8625c6c --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-multirc.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-multirc", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/adapter/roberta-base/superglue-multirc", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-multirc", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-multirc", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/superglue-record.json b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-record.json new file mode 100644 index 0000000..9326a30 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-record.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-record", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 512, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/adapter/roberta-base/superglue-record", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-record", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-record", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/superglue-wic.json b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-wic.json new file mode 100644 index 0000000..f561411 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-wic.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + 
"eval_dataset_name": "superglue-wic", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/adapter/roberta-base/superglue-wic", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-wic", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wic", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/superglue-wsc.fixed.json b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-wsc.fixed.json new file mode 100644 index 0000000..a017357 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-wsc.fixed.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wsc.fixed", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/adapter/roberta-base/superglue-wsc.fixed", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-wsc.fixed", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wsc.fixed", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_seq2seq/README.md b/examples/legacies/examples_seq2seq/README.md similarity index 100% rename from examples/examples_seq2seq/README.md rename to examples/legacies/examples_seq2seq/README.md diff --git a/examples/examples_seq2seq/metrics/__init__.py b/examples/legacies/examples_seq2seq/__init__.py similarity index 100% rename from examples/examples_seq2seq/metrics/__init__.py rename to examples/legacies/examples_seq2seq/__init__.py diff --git a/examples/examples_seq2seq/configs/config_gen_bs.py b/examples/legacies/examples_seq2seq/configs/config_gen_bs.py similarity index 100% rename from examples/examples_seq2seq/configs/config_gen_bs.py rename to examples/legacies/examples_seq2seq/configs/config_gen_bs.py diff --git a/examples/examples_seq2seq/data_processors/__init__.py b/examples/legacies/examples_seq2seq/data_processors/__init__.py similarity index 100% rename from examples/examples_seq2seq/data_processors/__init__.py rename to examples/legacies/examples_seq2seq/data_processors/__init__.py diff --git 
a/examples/examples_seq2seq/data_processors/data_collator.py b/examples/legacies/examples_seq2seq/data_processors/data_collator.py similarity index 100% rename from examples/examples_seq2seq/data_processors/data_collator.py rename to examples/legacies/examples_seq2seq/data_processors/data_collator.py diff --git a/examples/examples_seq2seq/data_processors/postprocessors.py b/examples/legacies/examples_seq2seq/data_processors/postprocessors.py similarity index 100% rename from examples/examples_seq2seq/data_processors/postprocessors.py rename to examples/legacies/examples_seq2seq/data_processors/postprocessors.py diff --git a/examples/examples_seq2seq/data_processors/tasks.py b/examples/legacies/examples_seq2seq/data_processors/tasks.py similarity index 100% rename from examples/examples_seq2seq/data_processors/tasks.py rename to examples/legacies/examples_seq2seq/data_processors/tasks.py diff --git a/examples/examples_seq2seq/data_processors/utils.py b/examples/legacies/examples_seq2seq/data_processors/utils.py similarity index 100% rename from examples/examples_seq2seq/data_processors/utils.py rename to examples/legacies/examples_seq2seq/data_processors/utils.py diff --git a/examples/examples_seq2seq/utils/__init__.py b/examples/legacies/examples_seq2seq/metrics/__init__.py similarity index 100% rename from examples/examples_seq2seq/utils/__init__.py rename to examples/legacies/examples_seq2seq/metrics/__init__.py diff --git a/examples/examples_seq2seq/metrics/metrics.py b/examples/legacies/examples_seq2seq/metrics/metrics.py similarity index 100% rename from examples/examples_seq2seq/metrics/metrics.py rename to examples/legacies/examples_seq2seq/metrics/metrics.py diff --git a/examples/examples_seq2seq/metrics/qa_utils.py b/examples/legacies/examples_seq2seq/metrics/qa_utils.py similarity index 100% rename from examples/examples_seq2seq/metrics/qa_utils.py rename to examples/legacies/examples_seq2seq/metrics/qa_utils.py diff --git a/examples/examples_seq2seq/run_seq2seq.py b/examples/legacies/examples_seq2seq/run_seq2seq.py similarity index 100% rename from examples/examples_seq2seq/run_seq2seq.py rename to examples/legacies/examples_seq2seq/run_seq2seq.py diff --git a/examples/examples_seq2seq/seq2seq_trainer.py b/examples/legacies/examples_seq2seq/seq2seq_trainer.py similarity index 100% rename from examples/examples_seq2seq/seq2seq_trainer.py rename to examples/legacies/examples_seq2seq/seq2seq_trainer.py diff --git a/examples/examples_seq2seq/trainers/__init__.py b/examples/legacies/examples_seq2seq/trainers/__init__.py similarity index 100% rename from examples/examples_seq2seq/trainers/__init__.py rename to examples/legacies/examples_seq2seq/trainers/__init__.py diff --git a/examples/examples_seq2seq/trainers/model_args.py b/examples/legacies/examples_seq2seq/trainers/model_args.py similarity index 100% rename from examples/examples_seq2seq/trainers/model_args.py rename to examples/legacies/examples_seq2seq/trainers/model_args.py diff --git a/examples/examples_seq2seq/trainers/seq2seq_trainer.py b/examples/legacies/examples_seq2seq/trainers/seq2seq_trainer.py similarity index 100% rename from examples/examples_seq2seq/trainers/seq2seq_trainer.py rename to examples/legacies/examples_seq2seq/trainers/seq2seq_trainer.py diff --git a/examples/examples_seq2seq/trainers/trainer.py b/examples/legacies/examples_seq2seq/trainers/trainer.py similarity index 100% rename from examples/examples_seq2seq/trainers/trainer.py rename to examples/legacies/examples_seq2seq/trainers/trainer.py diff 
--git a/examples/examples_seq2seq/trainers/trainer_args.py b/examples/legacies/examples_seq2seq/trainers/trainer_args.py similarity index 100% rename from examples/examples_seq2seq/trainers/trainer_args.py rename to examples/legacies/examples_seq2seq/trainers/trainer_args.py diff --git a/examples/examples_seq2seq/trainers/trainer_utils.py b/examples/legacies/examples_seq2seq/trainers/trainer_utils.py similarity index 100% rename from examples/examples_seq2seq/trainers/trainer_utils.py rename to examples/legacies/examples_seq2seq/trainers/trainer_utils.py diff --git a/examples/legacies/examples_seq2seq/utils/__init__.py b/examples/legacies/examples_seq2seq/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/examples_seq2seq/utils/utils.py b/examples/legacies/examples_seq2seq/utils/utils.py similarity index 100% rename from examples/examples_seq2seq/utils/utils.py rename to examples/legacies/examples_seq2seq/utils/utils.py diff --git a/examples/examples_text-classification/README.md b/examples/legacies/examples_text-classification/README.md similarity index 100% rename from examples/examples_text-classification/README.md rename to examples/legacies/examples_text-classification/README.md diff --git a/examples/examples_text-classification/configs/config_gen.py b/examples/legacies/examples_text-classification/configs/config_gen.py similarity index 96% rename from examples/examples_text-classification/configs/config_gen.py rename to examples/legacies/examples_text-classification/configs/config_gen.py index fb7d35a..096dddb 100644 --- a/examples/examples_text-classification/configs/config_gen.py +++ b/examples/legacies/examples_text-classification/configs/config_gen.py @@ -161,6 +161,20 @@ AllConfigs['adapter_roberta-base'].update({ "output_dir": "outputs/adapter/roberta-base/", }) +AllConfigs['parallel_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['parallel_adapter_roberta-base'].update({ + "delta_type": "parallel_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier", + ], + "bottleneck_dim":24, + "output_dir": "outputs/parallel_adapter/roberta-base/", + }) + AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) AllConfigs['lora_roberta-base'].update({ "delta_type": "lora", diff --git a/examples/examples_text-classification/configs/lora_roberta-base/lora_cola.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_cola.json similarity index 100% rename from examples/examples_text-classification/configs/lora_roberta-base/lora_cola.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_cola.json diff --git a/examples/examples_text-classification/configs/lora_roberta-base/lora_mnli.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_mnli.json similarity index 100% rename from examples/examples_text-classification/configs/lora_roberta-base/lora_mnli.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_mnli.json diff --git a/examples/examples_text-classification/configs/lora_roberta-base/lora_mrpc.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_mrpc.json similarity index 100% rename from examples/examples_text-classification/configs/lora_roberta-base/lora_mrpc.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_mrpc.json diff 
--git a/examples/examples_text-classification/configs/lora_roberta-base/lora_qnli.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_qnli.json similarity index 100% rename from examples/examples_text-classification/configs/lora_roberta-base/lora_qnli.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_qnli.json diff --git a/examples/examples_text-classification/configs/lora_roberta-base/lora_qqp.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_qqp.json similarity index 100% rename from examples/examples_text-classification/configs/lora_roberta-base/lora_qqp.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_qqp.json diff --git a/examples/examples_text-classification/configs/lora_roberta-base/lora_rte.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_rte.json similarity index 100% rename from examples/examples_text-classification/configs/lora_roberta-base/lora_rte.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_rte.json diff --git a/examples/examples_text-classification/configs/lora_roberta-base/lora_sst2.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_sst2.json similarity index 100% rename from examples/examples_text-classification/configs/lora_roberta-base/lora_sst2.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_sst2.json diff --git a/examples/examples_text-classification/configs/lora_roberta-base/lora_stsb.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_stsb.json similarity index 100% rename from examples/examples_text-classification/configs/lora_roberta-base/lora_stsb.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_stsb.json diff --git a/examples/examples_text-classification/configs/lora_roberta-base/lora_wnli.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_wnli.json similarity index 100% rename from examples/examples_text-classification/configs/lora_roberta-base/lora_wnli.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_wnli.json diff --git a/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/cola.json b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/cola.json new file mode 100644 index 0000000..093e646 --- /dev/null +++ b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/cola.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "cola", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/cola", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, 
+ "task_name": "cola", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "cola", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/mnli.json b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/mnli.json new file mode 100644 index 0000000..a0dc9ec --- /dev/null +++ b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/mnli.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mnli", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/mnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "mnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mnli", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/mrpc.json b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/mrpc.json new file mode 100644 index 0000000..9c9c060 --- /dev/null +++ b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/mrpc.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mrpc", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/mrpc", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "mrpc", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mrpc", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/qnli.json b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/qnli.json new file mode 100644 index 0000000..021ee0e --- /dev/null +++ 
b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/qnli.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qnli", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/qnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "qnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qnli", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/qqp.json b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/qqp.json new file mode 100644 index 0000000..be3afde --- /dev/null +++ b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/qqp.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qqp", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/qqp", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "qqp", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qqp", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/rte.json b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/rte.json new file mode 100644 index 0000000..3a1710f --- /dev/null +++ b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/rte.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": 
"roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/sst2.json b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/sst2.json new file mode 100644 index 0000000..21b6f89 --- /dev/null +++ b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/sst2.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "sst2", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/sst2", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "sst2", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "sst2", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/stsb.json b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/stsb.json new file mode 100644 index 0000000..5845f4f --- /dev/null +++ b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/stsb.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "stsb", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/stsb", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "stsb", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "stsb", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], 
+ "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-boolq.json b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-boolq.json new file mode 100644 index 0000000..48747fe --- /dev/null +++ b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-boolq.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-boolq", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-boolq", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-boolq", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-boolq", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-cb.json b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-cb.json new file mode 100644 index 0000000..2e8a874 --- /dev/null +++ b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-cb.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-cb", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-cb", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-cb", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-cb", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-copa.json b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-copa.json new file mode 100644 index 0000000..46c7216 --- /dev/null +++ 
b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-copa.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-copa", + "eval_steps": 50, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 40, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-copa", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 50, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-copa", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-copa", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-multirc.json b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-multirc.json new file mode 100644 index 0000000..60ba873 --- /dev/null +++ b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-multirc.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-multirc", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-multirc", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-multirc", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-multirc", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-record.json b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-record.json new file mode 100644 index 0000000..4ce9097 --- /dev/null +++ b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-record.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-record", + "eval_steps": 200, + 
"evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 512, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-record", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-record", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-record", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wic.json b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wic.json new file mode 100644 index 0000000..c920a7a --- /dev/null +++ b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wic.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wic", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-wic", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-wic", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wic", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wsc.fixed.json b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wsc.fixed.json new file mode 100644 index 0000000..563af04 --- /dev/null +++ b/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wsc.fixed.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wsc.fixed", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-wsc.fixed", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + 
"per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-wsc.fixed", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wsc.fixed", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/cola.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/cola.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/cola.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/cola.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/mnli.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/mnli.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/mnli.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/mnli.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/mrpc.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/mrpc.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/mrpc.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/mrpc.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/qnli.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/qnli.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/qnli.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/qnli.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/qqp.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/qqp.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/qqp.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/qqp.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/rte.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/rte.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/rte.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/rte.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/sst2.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/sst2.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/sst2.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/sst2.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/stsb.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/stsb.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/stsb.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/stsb.json diff --git 
a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-boolq.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-boolq.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/superglue-boolq.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-boolq.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-cb.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-cb.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/superglue-cb.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-cb.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-copa.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-copa.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/superglue-copa.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-copa.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-multirc.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-multirc.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/superglue-multirc.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-multirc.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-record.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-record.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/superglue-record.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-record.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-wic.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-wic.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/superglue-wic.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-wic.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-wsc.fixed.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-wsc.fixed.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/superglue-wsc.fixed.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-wsc.fixed.json diff --git a/examples/examples_text-classification/metrics/glue.py b/examples/legacies/examples_text-classification/metrics/glue.py similarity index 100% rename from examples/examples_text-classification/metrics/glue.py rename to examples/legacies/examples_text-classification/metrics/glue.py diff --git a/examples/examples_text-classification/requirements.txt b/examples/legacies/examples_text-classification/requirements.txt similarity index 100% rename from examples/examples_text-classification/requirements.txt rename to 
examples/legacies/examples_text-classification/requirements.txt diff --git a/examples/examples_text-classification/run.sh b/examples/legacies/examples_text-classification/run.sh similarity index 100% rename from examples/examples_text-classification/run.sh rename to examples/legacies/examples_text-classification/run.sh diff --git a/examples/examples_text-classification/run_glue.py b/examples/legacies/examples_text-classification/run_glue.py similarity index 100% rename from examples/examples_text-classification/run_glue.py rename to examples/legacies/examples_text-classification/run_glue.py diff --git a/examples/examples_text-classification/util.py b/examples/legacies/examples_text-classification/util.py similarity index 100% rename from examples/examples_text-classification/util.py rename to examples/legacies/examples_text-classification/util.py diff --git a/examples/setup_seq2seq.py b/examples/legacies/setup_seq2seq.py similarity index 100% rename from examples/setup_seq2seq.py rename to examples/legacies/setup_seq2seq.py diff --git a/examples/setup_prompt.py b/examples/setup_prompt.py deleted file mode 100755 index 5a9c74d..0000000 --- a/examples/setup_prompt.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Install Compacter.""" -import os -import setuptools -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -#os.environ['TORCH_CUDA_ARCH_LIST']="3.5;3.7;6.1;7.0;7.5;8.6+PTX" - -def setup_package(): - long_description = "examples_prompt" - setuptools.setup( - name='examples_prompt', - version='0.0.1', - description='textual prompt example', - long_description=long_description, - long_description_content_type='text/markdown', - author='Shengding Hu', - license='MIT License', - packages=setuptools.find_packages( - exclude=['docs', 'tests', 'scripts']), - dependency_links=[ - 'https://download.pytorch.org/whl/torch_stable.html', - ], - classifiers=[ - 'Intended Audience :: Developers', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: MIT License', - 'Topic :: Scientific/Engineering :: Artificial Intelligence', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7.10', - ], - keywords='text nlp machinelearning', - # ext_modules=[ - # CUDAExtension('seq2seq.projections.fwh_cuda', - # sources=[ - # 'seq2seq/projections/fwh_cuda/fwh_cpp.cpp', - # 'seq2seq/projections/fwh_cuda/fwh_cu.cu', - # ] - # ) - # ] - # , - cmdclass={"build_ext": BuildExtension}, - install_requires=[ - ], - ) - - -if __name__ == '__main__': - setup_package() diff --git a/examples/tutorial/2_with_bmtrain.py b/examples/tutorial/2_with_bmtrain.py index 7e35189..d543355 100644 --- a/examples/tutorial/2_with_bmtrain.py +++ b/examples/tutorial/2_with_bmtrain.py @@ -12,9 +12,9 @@ def manual_seed(seed): from model_center.model import Bert, BertConfig bmt.init_distributed() -config = BertConfig.from_pretrained("/yinxr/zwl/.cache/model_center/bert-base-uncased") +config = BertConfig.from_pretrained("bert-base-uncased") config.dropout_p = 0 -model = Bert.from_pretrained("/yinxr/zwl/.cache/model_center/bert-base-uncased", config) +model = Bert.from_pretrained("bert-base-uncased", config) print("before modify") od.Visualization(model).structure_graph() @@ -26,7 +26,7 @@ delta_model = LoraModel(backbone_model=model, modified_modules=['project_q', 'pr # delta_model = LowRankAdapterModel(backbone_model=model, modified_modules=['[r]layers\\.(\d)+\\.self_att', '[r]layers\\.(\d)+\\.ffn']) # delta_model = BitFitModel(backbone_model=model, 
modified_modules=['[r]layers\\.(\d)+\\.self_att', '[r]layers\\.(\d)+\\.ffn', '[r](.*)layernorm(.*)']) -print(delta_model.delta_modules) +# print(delta_model.delta_modules) print("after modify") delta_model.log() diff --git a/examples/unittest/must_try.py b/examples/unittest/must_try.py new file mode 100644 index 0000000..1837a75 --- /dev/null +++ b/examples/unittest/must_try.py @@ -0,0 +1,72 @@ +# use tranformers as usual. +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer +t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large") +t5_tokenizer = AutoTokenizer.from_pretrained("t5-large") +# A running example +inputs_ids = t5_tokenizer.encode("Is Harry Poter wrtten by JKrowling", return_tensors="pt") +t5_tokenizer.decode(t5.generate(inputs_ids)[0]) +# >>> '? Is it Harry Potter?' + + +# use existing delta models +from opendelta import AutoDeltaModel, AutoDeltaConfig +# use existing delta models from DeltaCenter +delta = AutoDeltaModel.from_finetuned("thunlp/Spelling_Correction_T5_LRAdapter_demo", backbone_model=t5) +# freeze the whole backbone model except the delta models. +delta.freeze_module() +# visualize the change +delta.log() + + +t5_tokenizer.decode(t5.generate(inputs_ids)[0]) +# >>> Is Harry Potter written by JK Rowling? + + +# Now save merely the delta models, not the whole backbone model, to tmp/ +delta.save_finetuned(".tmp") +import os; os.listdir(".tmp") +# >>> The state dict size is 1.443 MB +# >>> We encourage users to push their final and public models to delta center to share them with the community! + + +# reload the model from local url and add it to pre-trained T5. +t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large") +delta1 = AutoDeltaModel.from_finetuned(".tmp", backbone_model=t5) +import shutil; shutil.rmtree(".tmp") # don't forget to remove the tmp files. +t5_tokenizer.decode(t5.generate(inputs_ids)[0]) +# >>> Is Harry Potter written by JK Rowling? + +# detach the delta models, the model returns to the unmodified status. +delta1.detach() +t5_tokenizer.decode(t5.generate(inputs_ids)[0]) +# >>> '? Is it Harry Potter?' + +# use default configuration for cunstomized wrapped models which have PLMs inside. This is a common need for users. +import torch.nn as nn +class WrappedModel(nn.Module): + def __init__(self, inner_model): + super().__init__() + self.inner = inner_model + def forward(self, *args, **kwargs): + return self.inner(*args, **kwargs) + +wrapped_model = WrappedModel(WrappedModel(t5)) + +# say we use LoRA +delta_config = AutoDeltaConfig.from_dict({"delta_type":"lora"}) +delta2 = AutoDeltaModel.from_config(delta_config, backbone_model=wrapped_model) +delta2.log() +# >>> root +# -- inner +# -- inner +# ... +# ... 
lora_A:[8,1024], lora_B:[1024,8] +delta2.detach() + +# use a not default configuration +# say we add lora to the last four layer of the decoder of t5, with lora rank=5 +delta_config3 = AutoDeltaConfig.from_dict({"delta_type":"lora", "modified_modules":["[r]decoder.*((20)|(21)|(22)|(23)).*DenseReluDense\.wi"], "lora_r":5}) +delta3 = AutoDeltaModel.from_config(delta_config3, backbone_model=wrapped_model) +delta3.log() + + diff --git a/examples/unittest/test_accelerate.py b/examples/unittest/test_accelerate.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/unittest/test_bmtrain.py b/examples/unittest/test_bmtrain.py new file mode 100644 index 0000000..9096fe4 --- /dev/null +++ b/examples/unittest/test_bmtrain.py @@ -0,0 +1,255 @@ +import time +import os + +import torch +import numpy as np +from sklearn.metrics import accuracy_score, recall_score, f1_score + +import bmtrain as bmt + +from model_center import get_args +from model_center.model import Bert +from model_center.tokenizer import BertTokenizer +from model_center.dataset.bertdataset import DATASET +from model_center.utils import print_inspect +from model_center.layer import Linear +from model_center.dataset import DistributedDataLoader + +class BertModel(torch.nn.Module): + def __init__(self, args, num_types): + super().__init__() + self.bert : Bert = Bert.from_pretrained(args.model_config) + dim_model = self.bert.input_embedding.dim_model + self.dense = Linear(dim_model, num_types) + bmt.init_parameters(self.dense) + + def forward(self, *args, **kwargs): + pooler_output = self.bert(*args, **kwargs, output_pooler_output=True).pooler_output + logits = self.dense(pooler_output) + return logits + +def get_tokenizer(args): + tokenizer = BertTokenizer.from_pretrained(args.model_config) + return tokenizer + +def get_model(args): + num_types = { + "BoolQ" : 2, + "CB" : 3, + "COPA" : 1, + "RTE" : 2, + "WiC" : 2, + } + model = BertModel(args, num_types[args.dataset_name]) + return model + +def get_optimizer(args, model): + optimizer = bmt.optim.AdamOffloadOptimizer(model.parameters(), weight_decay=args.weight_decay) + return optimizer + +def get_learning_rate_scheduler(args, optimizer): + if args.lr_decay_iters is None: + args.lr_decay_iters = args.train_iters * args.epochs + if args.lr_decay_style == "noam": + lr_scheduler = bmt.lr_scheduler.Noam(optimizer, + start_lr = args.lr, + warmup_iter = args.warmup_iters, + end_iter = args.lr_decay_iters, + num_iter = args.start_step) + elif args.lr_decay_style == "constant": + lr_scheduler = bmt.lr_scheduler.NoDecay(optimizer, + start_lr = args.lr, + warmup_iter = args.warmup_iters, + end_iter = -1, + num_iter = args.start_step) + elif args.lr_decay_style == "linear": + lr_scheduler = bmt.lr_scheduler.Linear(optimizer, + start_lr = args.lr, + warmup_iter = args.warmup_iters, + end_iter = args.lr_decay_iters, + num_iter = args.start_step) + elif args.lr_decay_style == "exponential": + lr_scheduler = bmt.lr_scheduler.Exponential(optimizer, + start_lr = args.lr, + warmup_iter = args.warmup_iters, + end_iter = args.lr_decay_iters, + num_iter = args.start_step) + elif args.lr_decay_style == "cosine": + lr_scheduler = bmt.lr_scheduler.Cosine(optimizer, + start_lr = args.lr, + warmup_iter = args.warmup_iters, + end_iter = args.lr_decay_iters, + num_iter = args.start_step) + else: + raise ValueError(f"lr_scheduler of type {args.lr_decay_style} is not supported yet.") + + return lr_scheduler + +def setup_model_and_optimizer(args): + # get the tokenizer + tokenizer = get_tokenizer(args) + # 
get the model + model = get_model(args) + bmt.synchronize() + # get the optimizer and lr_scheduler + optimizer = get_optimizer(args, model) + lr_scheduler = get_learning_rate_scheduler(args, optimizer) + bmt.synchronize() + # get the memory usage + bmt.print_rank("Model mem\n", torch.cuda.memory_summary()) + bmt.synchronize() + return tokenizer, model, optimizer, lr_scheduler + +def initialize(): + # get arguments + args = get_args() + # init bmt + bmt.init_distributed(seed = args.seed) + # init save folder + if args.save != None: + os.makedirs(args.save, exist_ok=True) + return args + +def prepare_dataset(args, tokenizer, base_path, dataset_name, rank, world_size): + splits = ['train', 'dev', 'test'] + dataset = {} + for split in splits: + dataset[split] = DATASET[dataset_name](base_path, split, rank, world_size, tokenizer, args.max_encoder_length) + return dataset + + +def finetune(args, tokenizer, model, optimizer, lr_scheduler, dataset): + loss_func = bmt.loss.FusedCrossEntropy(ignore_index=-100) + + optim_manager = bmt.optim.OptimManager(loss_scale=args.loss_scale) + optim_manager.add_optimizer(optimizer, lr_scheduler) + + print_inspect(model, '*') + + for epoch in range(12): + dataloader = { + "train": DistributedDataLoader(dataset['train'], batch_size=args.batch_size, shuffle=True), + "dev": DistributedDataLoader(dataset['dev'], batch_size=args.batch_size, shuffle=False), + } + + model.train() + for it, data in enumerate(dataloader['train']): + if args.dataset_name == 'COPA': + input_ids0 = data["input_ids0"] + attention_mask0 = data["attention_mask0"] + token_type_ids0 = data["token_type_ids0"] + input_ids1 = data["input_ids1"] + attention_mask1 = data["attention_mask1"] + token_type_ids1 = data["token_type_ids1"] + labels = data["labels"] + else: + input_ids = data["input_ids"] + attention_mask = data["attention_mask"] + token_type_ids = data["token_type_ids"] + labels = data["labels"] + + torch.cuda.synchronize() + st_time = time.time() + + if args.dataset_name == 'COPA': + logits = torch.cat([ + model(input_ids0, attention_mask=attention_mask0, token_type_ids=token_type_ids0), + model(input_ids1, attention_mask=attention_mask1, token_type_ids=token_type_ids1), + ], dim=1) + else: + logits = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) + loss = loss_func(logits.view(-1, logits.shape[-1]), labels.view(-1)) + + global_loss = bmt.sum_loss(loss).item() + + optim_manager.zero_grad() + + optim_manager.backward(loss) + grad_norm = optim_manager.clip_grad_norm(optimizer.param_groups, args.clip_grad, norm_type = 2) + + optim_manager.step() + + torch.cuda.synchronize() + elapsed_time = time.time() - st_time + + bmt.print_rank( + "train | epoch {:3d} | Iter: {:6d}/{:6d} | loss: {:.4f} | lr: {:.4e}, scale: {:10.4f} | grad_norm: {:.4f} | time: {:.3f}".format( + epoch, + it, + len(dataloader["train"]), + global_loss, + lr_scheduler.current_lr, + int(optim_manager.loss_scale), + grad_norm, + elapsed_time, + ) + ) + + model.eval() + with torch.no_grad(): + for split in ['dev']: + pd = [] + gt = [] + for it, data in enumerate(dataloader[split]): + if args.dataset_name == 'COPA': + input_ids0 = data["input_ids0"] + attention_mask0 = data["attention_mask0"] + token_type_ids0 = data["token_type_ids0"] + input_ids1 = data["input_ids1"] + attention_mask1 = data["attention_mask1"] + token_type_ids1 = data["token_type_ids1"] + labels = data["labels"] + logits = torch.cat([ + model(input_ids0, attention_mask=attention_mask0, token_type_ids=token_type_ids0), + 
model(input_ids1, attention_mask=attention_mask1, token_type_ids=token_type_ids1), + ], dim=1) + else: + input_ids = data["input_ids"] + attention_mask = data["attention_mask"] + token_type_ids = data["token_type_ids"] + labels = data["labels"] + logits = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) + + loss = loss_func(logits.view(-1, logits.shape[-1]), labels.view(-1)) + logits = logits.argmax(dim=-1) + pd.extend(logits.cpu().tolist()) + gt.extend(labels.cpu().tolist()) + + bmt.print_rank( + "{} | epoch {:3d} | Iter: {:6d}/{:6d} | loss: {:.4f}".format( + split, + epoch, + it, + len(dataloader[split]), + loss, + ) + ) + + pd = bmt.gather_result(torch.tensor(pd).int()).cpu().tolist() + gt = bmt.gather_result(torch.tensor(gt).int()).cpu().tolist() + + bmt.print_rank(f"{split} epoch {epoch}:") + if args.dataset_name in ["BoolQ", "CB", "COPA", "RTE", "WiC", "WSC"]: + acc = accuracy_score(gt, pd) + bmt.print_rank(f"accuracy: {acc*100:.2f}") + if args.dataset_name in ["CB"]: + rcl = f1_score(gt, pd, average="macro") + f1 = recall_score(gt, pd, average="macro") + bmt.print_rank(f"recall: {rcl*100:.2f}") + bmt.print_rank(f"Average F1: {f1*100:.2f}") + + +def main(): + args = initialize() + tokenizer, model, optimizer, lr_scheduler = setup_model_and_optimizer(args) + dataset = prepare_dataset( + args, + tokenizer, + f"{args.base_path}/down_data/superglue/", + args.dataset_name, + bmt.rank(), bmt.world_size(), + ) + finetune(args, tokenizer, model, optimizer, lr_scheduler, dataset) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/unittest/test_half_precision.py b/examples/unittest/test_half_precision.py new file mode 100644 index 0000000..e69de29 diff --git a/docs/source/notes/knownissue.md b/examples/unittest/test_main.py similarity index 100% rename from docs/source/notes/knownissue.md rename to examples/unittest/test_main.py diff --git a/examples/unittest/user_defined.py b/examples/unittest/user_defined.py new file mode 100644 index 0000000..cd49e8b --- /dev/null +++ b/examples/unittest/user_defined.py @@ -0,0 +1,182 @@ +# Adapted from Tevatron (https://github.com/texttron/tevatron) + +from argparse import ArgumentParser +import logging +import os +import sys +import torch.nn as nn + +logger = logging.getLogger(__name__) + + +class UnitTest: + def __init__(self, models): + self.models = models + + self.Configs = {} + self.Configs[0] = { + "delta_type": "lora", + } + + self.Configs[1] = { + "delta_type": "bitfit", + } + + self.Configs[2] = { + "delta_type": "adapter", + } + + self.Configs[3] = { + "delta_type": "compacter", + } + + self.Configs[4] = { + "delta_type": "prefix", + } + + self.Configs[5] = { + "delta_type": "soft_prompt", + } + + self.Configs[6] = { + "delta_type": "low_rank_adapter", + } + + def get_delta_config(self, config_id): + return self.Configs[config_id] + + + def unitTest0(self, delta_config_dict): + model = self.models[0] + from opendelta import Visualization + Visualization(model).structure_graph() + + from opendelta import AutoDeltaConfig, AutoDeltaModel + + delta_config = AutoDeltaConfig.from_dict(delta_config_dict) + delta_model = AutoDeltaModel.from_config(delta_config, backbone_model = model) + + from opendelta import Visualization + Visualization(model).structure_graph() + + def unitTest1(self, delta_config_dict): + class Mymodel(nn.Module): + def __init__(self, a,b): + super().__init__() + self.a = a + self.b = b + + model = Mymodel(self.models[0], self.models[1]) + from opendelta import 
Visualization + Visualization(model).structure_graph() + + from opendelta import AutoDeltaConfig, AutoDeltaModel + + delta_config = AutoDeltaConfig.from_dict(delta_config_dict) + delta_model = AutoDeltaModel.from_config(delta_config, backbone_model = model) + + from opendelta import Visualization + Visualization(model).structure_graph() + delta_model.save_finetuned("./tmp") + + delta_model.freeze_module(exclude=['deltas']) + delta_model.save_finetuned("./tmp") + + model = Mymodel(self.models[0], self.models[1]) + Visualization(model).structure_graph() + delta_model = AutoDeltaModel.from_finetuned("./tmp", backbone_model=model) + Visualization(model).structure_graph() + + + + + + + + def unit_test(self, test_id, config_id): + delta_config_dict = self.Configs[config_id] + if test_id == 0: + self.unitTest0(delta_config_dict) + elif test_id == 1: + self.unitTest1(delta_config_dict) + + +from dataclasses import dataclass, field + +@dataclass +class UnitTestArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + config_id: int = field( + default=0, + ) + test_id: int = field( + default=0, + ) + model_name_or_path: str =field( + default='bert-base-cased', + metadata={"help": "tested: bert-base-cased, roberta-base, rinna/japanese-gpt2-small, t5-small, facebook/opt-125m"} + ) + + +from transformers import HfArgumentParser,TrainingArguments, AutoModel, GPT2Model + +def main(): + parser = HfArgumentParser((TrainingArguments, UnitTestArguments)) + + + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + training_args, unit_test_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + training_args, unit_test_args = parser.parse_args_into_dataclasses() + training_args: TrainingArguments + + if ( + os.path.exists(training_args.output_dir) + and os.listdir(training_args.output_dir) + and training_args.do_train + and not training_args.overwrite_output_dir + ): + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
+ ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + training_args.local_rank, + training_args.device, + training_args.n_gpu, + bool(training_args.local_rank != -1), + training_args.fp16, + ) + logger.info("Training/evaluation parameters %s", training_args) + + + model = AutoModel.from_pretrained(unit_test_args.model_name_or_path) + + import torch + import copy + models = [model, copy.deepcopy(model)] + + + unit_test = UnitTest(models) + + + unit_test.unit_test(unit_test_args.test_id, unit_test_args.config_id) + + + + + + + + +if __name__ == "__main__": + main() diff --git a/opendelta/__init__.py b/opendelta/__init__.py index f9301d2..431cfa1 100644 --- a/opendelta/__init__.py +++ b/opendelta/__init__.py @@ -1,5 +1,5 @@ -__version__ = "0.1.0" +__version__ = "0.3.0" class GlobalSetting: def __init__(self): diff --git a/opendelta/auto_delta.py b/opendelta/auto_delta.py index 6240d0c..8d781f1 100644 --- a/opendelta/auto_delta.py +++ b/opendelta/auto_delta.py @@ -2,7 +2,6 @@ from copy import deepcopy from typing import Any, Dict, OrderedDict from opendelta.utils.visualization import Visualization import torch.nn as nn -from transformers.file_utils import PushToHubMixin from opendelta.utils.logging import get_logger import importlib from opendelta.delta_configs import BaseDeltaConfig @@ -11,13 +10,14 @@ logger = get_logger(__name__) DELTA_CONFIG_MAPPING = { - "lora": "LoraConfig", + "lora": "LoraConfig", "low_rank_adapter": "LowRankAdapterConfig", "bitfit": "BitFitConfig", "adapter":"AdapterConfig", "compacter":"CompacterConfig", "prefix": "PrefixConfig", "soft_prompt": "SoftPromptConfig", + "parallel_adapter": "ParallelAdapterConfig", } DELTA_MODEL_MAPPING = { @@ -28,6 +28,7 @@ DELTA_MODEL_MAPPING = { "compacter": "CompacterModel", "prefix": "PrefixModel", "soft_prompt": "SoftPromptModel", + "parallel_adapter": "ParallelAdapterModel", } class _LazyConfigMapping(OrderedDict): @@ -82,27 +83,30 @@ LAZY_CONFIG_MAPPING = _LazyConfigMapping(DELTA_CONFIG_MAPPING) class AutoDeltaConfig: r""" This is a generic configuration class that will be instantiated as one of the configuration classes of the library - when created with the :py:meth:`~AutoConfig.from_pretrained` class method. + when created with the :meth:`~AutoDeltaConfig.from_finetuned` or :meth:`~AutoDeltaConfig.from_dict` class method. This class cannot be instantiated directly using ``__init__()`` (throws an error). """ - def __init__(self): - raise EnvironmentError( - "AutoConfig is designed to be instantiated " - "using the ``AutoConfig.from_pretrained(pretrained_model_name_or_path)`` method." + def __init__(self, *args, **kwargs): + raise AttributeError( + f"{self.__class__.__name__} is designed to be instantiated using\n\t(1) `{self.__class__.__name__}.from_finetuned(finetuned_model_name_or_path)`\nor\t(2) `{self.__class__.__name__}.from_dict(config_dict, **kwargs)` " ) - + @classmethod def from_dict(cls, config_dict: Dict[str, Any], **kwargs): - r""" Instantiate a DeltaConfig according to the dict. Automatically load the config specified by + r""" Instantiate a DeltaConfig according to the dict. Automatically load the config specified by :obj:`delta_type`. Args: config_dict (:obj:`dict`): The dict of configs of delta model. 
- kwargs: Other keyword argument pass to initialize the config. + kwargs: Other keyword argument pass to initialize the config. - >>> config = AutoDeltaConfig.from_dict({"delta_type":"lora"}) # This will load the dault lora config. - >>> config = AutoDeltaConfig.from_dict({"delta_type":"lora", "lora_r":5}) # Will load the default lora config, with lora_r = 5 + Examples: + + .. code-block:: python + + config = AutoDeltaConfig.from_dict({"delta_type":"lora"}) # This will load the dault lora config. + config = AutoDeltaConfig.from_dict({"delta_type":"lora", "lora_r":5}) # Will load the default lora config, with lora_r = 5 """ config_dict = deepcopy(config_dict) @@ -114,7 +118,7 @@ class AutoDeltaConfig: @classmethod - def from_finetuned(cls, finetuned_model_name_or_path, **kwargs): + def from_finetuned(cls, finetuned_delta_path, **kwargs): r""" Instantiate one of the configuration classes of the library from a finetuned delta model configuration. The configuration class to instantiate is selected based on the ``delta_type`` property of the config object that @@ -122,76 +126,43 @@ class AutoDeltaConfig: Parameters: - finetuned_model_name_or_path (:obj:`str` or :obj:`os.PathLike`, *optional*): - Can be either: + finetuned_delta_path (:obj:`str` or :obj:`os.PathLike`, *optional*): Can be either: + + - A string, the model id of a finetuned delta model configuration hosted inside a model repo on huggingface.co. Valid model ids can be located at the root-level, like ``Davin/lora``, or namespaced under a user or organization name, like ``DeltaHub/lora_t5-base_mrpc``. + - A path to a *directory* containing a configuration file saved using the :py:meth:`~opendelta.basemodel.DeltaBase.save_finetuned` method, e.g., ``./my_model_directory/``. + - A path or url to a saved configuration JSON *file*, e.g.,``./my_model_directory/configuration.json``. - - A string, the *model id* of a finetuned delta model configuration hosted inside a model repo on - huggingface.co. Valid model ids can be located at the root-level, like ``Davin/lora``, or - namespaced under a user or organization name, like ``DeltaHub/lora_t5-base_mrpc``. - - A path to a *directory* containing a configuration file saved using the - :py:meth:`DeltaBase.save_finetuned` method, - e.g., ``./my_model_directory/``. - - A path or url to a saved configuration JSON *file*, e.g., - ``./my_model_directory/configuration.json``. - The last two option are not tested but inherited from huggingface. cache_dir (:obj:`str` or :obj:`os.PathLike`, *optional*): Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used. - force_download (:obj:`bool`, *optional*, defaults to :obj:`False`): - Whether or not to force the (re-)download the model weights and configuration files and override the - cached versions if they exist. - resume_download (:obj:`bool`, *optional*, defaults to :obj:`False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. - proxies (:obj:`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - revision(:obj:`str`, *optional*, defaults to ``"main"``): - The specific model version to use. 
It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - return_unused_kwargs (:obj:`bool`, *optional*, defaults to ``False``): - If ``False``, then this function returns just the final configuration object. - If ``True``, then this functions returns a ``Tuple(config, unused_kwargs)`` where *unused_kwargs* is a - dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., the - part of ``kwargs`` which has not been used to update ``config`` and is otherwise ignored. - trust_remote_code (:obj:`bool`, *optional*, defaults to ``False``): - Whether or not to allow for custom models defined on the Hub in their own modeling files. This option - should only be set to ``True`` for repositories you trust and in which you have read the code, as it will - execute code present on the Hub on your local machine. - kwargs(additional keyword arguments, *optional*): - The values in kwargs of any keys which are configuration attributes will be used to override the loaded - values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled - by the ``return_unused_kwargs`` keyword parameter. - + Examples: - + .. code-block:: python from transformers import AutoConfig - delta_config = AutoDeltaConfig.from_finetuned("DeltaHub/lora_t5-base-mrpc") + delta_config = AutoDeltaConfig.from_finetuned("thunlp/FactQA_T5-large_Adapter") """ - kwargs["name_or_path"] = finetuned_model_name_or_path - config_dict, _ = BaseDeltaConfig.get_config_dict(finetuned_model_name_or_path, **kwargs) + config_dict, kwargs = BaseDeltaConfig.get_config_dict(finetuned_delta_path, **kwargs) if "delta_type" in config_dict: config_class = LAZY_CONFIG_MAPPING[config_dict["delta_type"]] return config_class.from_dict(config_dict, **kwargs) else: # Fallback: use pattern matching on the string. for pattern, config_class in LAZY_CONFIG_MAPPING.items(): - if pattern in str(finetuned_model_name_or_path): + if pattern in str(finetuned_delta_path): return config_class.from_dict(config_dict, **kwargs) raise ValueError( - f"Unrecognized model in {finetuned_model_name_or_path}. " + f"Unrecognized model in {finetuned_delta_path}. " f"Should have a `delta_type` key in the loaded config, or contain one of the following strings " f"in its name: {', '.join(LAZY_CONFIG_MAPPING.keys())}" ) -### AutoModels below +### AutoModels below class _LazyAutoMapping(OrderedDict): """ @@ -318,25 +289,29 @@ class AutoDeltaModel: """ _delta_model_mapping = LAZY_DELTA_MAPPING def __init__(self, *args, **kwargs): - raise EnvironmentError( - f"{self.__class__.__name__} is designed to be instantiated " - f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or " - f"`{self.__class__.__name__}.from_config(config)` methods." + # raise EnvironmentError( + # f"{self.__class__.__name__} is designed to be instantiated " + # f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or " + # f"`{self.__class__.__name__}.from_config(config)` methods." 
+ # ) + + raise AttributeError( + f"{self.__class__.__name__} is designed to be instantiated using\n\t(1) `{self.__class__.__name__}.from_finetuned(finetuned_delta_path, backbone_model, *model_args, **kwargs)`\nor\t(2) `{self.__class__.__name__}.from_config(delta_config, backbone_model, **kwargs)`" ) - + @classmethod - def from_config(cls, config, backbone_model, **kwargs): #-> "DeltaBase": + def from_config(cls, config, backbone_model, **kwargs) -> DeltaBase: r"""Automatically instantiates a delta model based on the :obj:`config`. The delta model correspond to the delta - :obj:`config` will be loaded and initialized using the arguments in :obj:`config`. + :obj:`config` will be loaded and initialized using the arguments in :obj:`config`. .. note:: - Only using :meth:`from_config` method will not load the finetuned weight file (e.g., pytorch_model.bin). - Please use from_finetuned directly. + Only using :meth:`from_config` method will not load the finetuned weight file (e.g., pytorch_model.bin). + Please use from_finetuned directly. Args: config (:obj:`BaseDeltaConfig`): backbone_model (:obj:`nn.Module`): - + Examples: .. code-block:: python @@ -355,53 +330,47 @@ class AutoDeltaModel: ) @classmethod - def from_finetuned(cls, finetuned_model_name_or_path, backbone_model, *model_args, **kwargs): - r""" Automatically instantiated a delta model and load the finetuned checkpoints based on the - :obj:`finetuned_model_name_or_path`, which can either be a string pointing to a local path or a url pointint to - the delta hub. It will check the hash after loading the delta model to see whether the correct backbone and - delta checkpoint are used. + def from_finetuned(cls, finetuned_delta_path, backbone_model, *model_args, **kwargs) -> DeltaBase: + r""" Automatically instantiated a delta model and load the finetuned checkpoints based on the + :obj:`finetuned_delta_path`, which can either be a string pointing to a local path or a url pointint to + the delta hub. It will check the hash after loading the delta model to see whether the correct backbone and + delta checkpoint are used. Args: - finetuned_model_name_or_path (:obj:`str` or :obj:`os.PathLike`, *optional*): - Can be either: + finetuned_delta_path (:obj:`str` or :obj:`os.PathLike`, *optional*): Can be either: - - A string, the *model id* of a finetuned delta model configuration hosted inside a model repo on - huggingface.co. Valid model ids can be located at the root-level, like ``Davin/lora``, or - namespaced under a user or organization name, like ``DeltaHub/lora_t5-base_mrpc``. - - A path to a *directory* containing a configuration file saved using the - :py:meth:`DeltaBase.save_finetuned` method, - e.g., ``./my_model_directory/``. - - A path or url to a saved configuration JSON *file*, e.g., - ``./my_model_directory/configuration.json``. - The last two option are not tested but inherited from huggingface. + - A string, the model name of a finetuned delta model configuration hosted inside a model repo on `Delta Center `_, like ``thunlp/FactQA_T5-large_Adapter``. + - A path to a directory containing a configuration file saved using the :meth:`~opendelta.utils.saving_loading_utils.SaveLoadMixin.save_finetuned` method, e.g., ``./my_model_directory/``. + - A path or url to a saved configuration JSON *file*, e.g., ``./my_model_directory/configuration.json``.The last two option are not tested but inherited from huggingface. backbone_model (:obj:`nn.Module`): The backbone model to be modified. - model_args: Other argument for initialize the model. 
+ model_args: Other argument for initialize the model. See :`DeltaBase.from_finetuned` for details. + kwargs: Other kwargs that will be passed into DeltaBase.from_finetuned. See `DeltaBase.from_finetuned` for details. Example: - + .. code-block:: python - delta_model = AutoDeltaModel.from_finetuned("DeltaHub/lora_t5-base-mrpc", backbone_model) + delta_model = AutoDeltaModel.from_finetuned("thunlp/FactQA_T5-large_Adapter", backbone_model=5) """ - config = kwargs.pop("config", None) + delta_config = kwargs.pop("delta_config", None) - if not isinstance(config, BaseDeltaConfig): - config, kwargs = AutoDeltaConfig.from_finetuned( - finetuned_model_name_or_path, return_unused_kwargs=True, **kwargs + if not isinstance(delta_config, BaseDeltaConfig): + delta_config, kwargs = AutoDeltaConfig.from_finetuned( + finetuned_delta_path, return_unused_kwargs=True, **kwargs ) - if type(config) in cls._delta_model_mapping.keys(): - model_class = cls._delta_model_mapping[type(config)] - return model_class.from_finetuned(finetuned_model_name_or_path, backbone_model, *model_args, **kwargs) + if type(delta_config) in cls._delta_model_mapping.keys(): + model_class = cls._delta_model_mapping[type(delta_config)] + return model_class.from_finetuned(finetuned_delta_path, backbone_model, *model_args, delta_config=delta_config, **kwargs) raise ValueError( f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." ) - - + + if __name__ == "__main__": diff --git a/opendelta/basemodel.py b/opendelta/basemodel.py index 06b09c8..27ff96d 100644 --- a/opendelta/basemodel.py +++ b/opendelta/basemodel.py @@ -3,6 +3,7 @@ from collections import OrderedDict from multiprocessing.sharedctypes import Value import os +from turtle import back from opendelta.delta_configs import BaseDeltaConfig from opendelta.utils.model_md5 import gen_model_hash from opendelta.utils.signature import get_arg_names, signature @@ -23,17 +24,17 @@ from opendelta.utils.structure_mapping import CommonStructureMap from opendelta.utils.interactive.web import interactive from opendelta.utils.data_parallel import new_replicate_for_data_parallel from opendelta.utils.cuda import move_dict_to_cuda +import sys +from opendelta.utils.data_parallel import caller_map logger = logging.get_logger(__name__) def is_leaf_module(module): r"""Whether the module is a leaf module """ - try: - return len([n for n,_ in module.named_children()]) == 0 - except: - from IPython import embed - embed() + return len([n for n,_ in module.named_children()]) == 0 + + def non_module_param(module: nn.Module): module_names = [n for n, _ in module.named_modules()] @@ -92,6 +93,7 @@ class DeltaBase(nn.Module, SaveLoadMixin): default_exclude_modules = ["lm_head"] config_class = BaseDeltaConfig default_unfrozen_modules = ["deltas"] + _need_pseudo_data = True def __init__(self, backbone_model: nn.Module, modified_modules: Optional[List[str]] = None, @@ -99,6 +101,7 @@ class DeltaBase(nn.Module, SaveLoadMixin): unfrozen_modules: Optional[List[str]] = None, interactive_modify: Optional[Union[bool, int]] = False, common_structure: Optional[bool] = False, + framework_type: Optional[str]= "hf", # select from ["hf", "bmt"] ): nn.Module.__init__(self) # register the backbone model after init using self.__dict__ method to avoid adding backbone_model @@ -129,15 +132,16 @@ class DeltaBase(nn.Module, SaveLoadMixin): self.exclude_modules = self.default_exclude_modules 
self.common_structure = common_structure if self.common_structure: - self.structure_mapping = CommonStructureMap.load(self.backbone_model) + self.structure_mapping = CommonStructureMap(self.backbone_model) else: self.structure_mapping = None if unfrozen_modules is None: self.unfrozen_modules = self.default_unfrozen_modules if self.common_structure and self.structure_mapping is None: raise RuntimeError("Using common structure but the structure mapping is None") + self.framework_type = framework_type - def forward(self, *args, **kwargs) -> "RuntimeError": + def forward(self, *args, **kwargs) -> RuntimeError: r""" .. warning:: @@ -197,15 +201,25 @@ class DeltaBase(nn.Module, SaveLoadMixin): # create a new key list to avoid recursion. backbone_key_list = [key for key, _ in backbone.named_modules()] for key in backbone_key_list: - if self.find_key(key, modified_modules): #TODO may have bugs when commonstructure has a virtual node and it's refered - logger.debug("find key: {}".format(key)) + if self.find_key(key, modified_modules): self.update_module(backbone, key) - self._pseudo_data_to_instantiate(backbone) + if self._need_pseudo_data: + self._pseudo_data_to_instantiate(backbone) + # mark the paratmers that are the delta parameters for easily displaying the delta_paramters. self.mark_as_delta() return backbone - + def _pseudo_data_to_instantiate(self, backbone: Optional[nn.Module]=None): + if self.structure_mapping is None: + self._pseudo_data_to_instantiate_module(backbone) + else: + for key in self.structure_mapping.matched_pairs: + if key == "": + submodule = backbone + else: + _, _, submodule = self.find_module(backbone, key) + self._pseudo_data_to_instantiate_module(submodule) def mark_as_delta(self, module: nn.Module=None,): r"""[NODOC] Mark :obj:`module`'s all parameters as delta parameters by setting a ``_is_delta`` attribute to each of them. @@ -277,21 +291,23 @@ class DeltaBase(nn.Module, SaveLoadMixin): if is_leaf_module(module): for n, p in module.named_parameters(): - if self.find_key(".".join([prefix,n]), exclude): + next_prefix = n if prefix == "" else ".".join([prefix,n]) + if self.find_key(next_prefix, exclude): continue if "deltas" not in exclude or (not (hasattr(p, "_is_delta") and getattr(p, "_is_delta"))): p.requires_grad = False return else: for n, c in module.named_children(): - if self.find_key(".".join([prefix,n]), exclude): # if found, untouch the parameters + next_prefix = n if prefix == "" else ".".join([prefix,n]) + if self.find_key(next_prefix, exclude): # if found, untouch the parameters continue else: # firstly freeze the non module params, then go deeper. params = non_module_param(module) for n, p in params: if "deltas" not in exclude or (not (hasattr(p, "_is_delta") and getattr(p, "_is_delta"))): p.requires_grad = False - self._freeze_module_recursive(c, exclude=exclude, prefix=".".join([prefix,n]) ) + self._freeze_module_recursive(c, exclude=exclude, prefix=next_prefix) @@ -311,19 +327,24 @@ class DeltaBase(nn.Module, SaveLoadMixin): for x in self.exclude_modules: if key.startswith(x): # start with the excluded key return False - if self.common_structure: - key = self.structure_mapping.transform(key, strict=False) + virtual_key, in_virtual_order = None, None + if self.structure_mapping is not None: + key, virtual_key, in_virtual_order = self.structure_mapping.transform(key, strict=False) + # currently in_virtual_order not in use, it means that if the common structure designate adding adapter to FFN, it will be add to all submodule of FFN. 
if not key: return False - try: + if virtual_key is None: return endswith_in(key, target_list) - except: - raise RuntimeError("find_key exception") + else: + return endswith_in(key, target_list) or endswith_in(virtual_key, target_list) - def _pseudo_data_to_instantiate(self, module: Optional[nn.Module]=None): - r"""Create a pseudo_data into the module to know the dimemsion of each tensor in the computation graph. - First try to use the dummy_inputs of the pretrained model. If the model has no dummy_inputs, will try to create - integer tensor as the pseudo_input, if ``decoder_input_ids`` is in the model's forward function, additional create it. + + def _pseudo_data_to_instantiate_module(self, module: Optional[nn.Module]=None): + r"""Some delta model requires a pseudo-data be passed through the model to understand the dimensionality of each tensor in the computation graph. + + (1) The model in the Huggingface Transformers library usually has the so-called `dummy_inputs`. We will make use of it. + (2) If the model does not have `dummy_inputs`, we will try to create it and throw a warning. + (3) If we encounter an error in (2), we will suggest you to create it by passing the dummy_inputs variable. Args: module (:obj:`nn.Module`, *optional*, default to :obj:`None`): The backbone model. @@ -332,17 +353,32 @@ class DeltaBase(nn.Module, SaveLoadMixin): if module is None: module = self.backbone_model device = get_device(module) + _auto_dummy = False try: dummy_inputs = module.dummy_inputs dummy_inputs = move_dict_to_cuda(dummy_inputs, device) - module(**dummy_inputs) except AttributeError: - logger.warning("No dummy_inputs attributes, create a common input_ids for input.") - pseudo_input = torch.tensor([[0,0]]).to(device) + logger.warning(f"No `dummy_inputs` attribute in {module.__class__.__name__} , automatically create `dummy_inputs`. Very likely to encounter error. To set dummy_inputs for your model, please use: `setattr(backbone_model, 'dummy_inputs', some_dummy_inputs)` before initializing `{self.__class__.__name__}`") + _auto_dummy = True + pass + if _auto_dummy: + _most_simple_input = torch.tensor([[0,0]]).to(device) if "decoder_input_ids" in signature(module.forward).args: - module(pseudo_input, decoder_input_ids = pseudo_input) + dummy_inputs = {"input_ids": _most_simple_input, "decoder_input_ids": _most_simple_input} else: - module(pseudo_input) + dummy_inputs = {"input_ids": _most_simple_input} + + _auto_dummy_fail = False + try: + module(**dummy_inputs) + except: + _auto_dummy_fail = True + if _auto_dummy_fail: + raise AttributeError(f"\n\tThe {self.__class__.__name__} requires a dummy_inputs to be passed through the model to understand the dimensionality of each tensor in the computation graph. \n\t The {module.__class__.__name__} Class has no dummy_inputs, and automatically created dummy_inputs failed.\n\t Refer to `https://opendelta.readthedocs.io/en/latest/notes/faq.html` for detail.") + + + + def trainable_parameters_names(self, module: Optional[nn.Module]=None): r"""[NODOC] A small sugar function to return all the trainable parameter's name in the (by default, backbone) model. 
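The warning added in the hunk above recommends setting `dummy_inputs` on the backbone yourself when automatic pseudo-data creation is likely to fail. A minimal sketch of that workaround, assuming a toy user-defined backbone; the class, module names, delta type, and tensor shapes below are illustrative, not taken from this patch:

```python
import torch
import torch.nn as nn
from opendelta import AutoDeltaConfig, AutoDeltaModel

class TinyBackbone(nn.Module):
    """A toy backbone that has no dummy_inputs attribute of its own."""
    def __init__(self, vocab_size=100, hidden_dim=16):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_dim)
        self.proj = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, input_ids):
        return self.proj(self.embed(input_ids))

backbone = TinyBackbone()
# Provide explicit pseudo data, as the warning above suggests, so the delta modules
# can trace tensor dimensions instead of relying on an auto-created input_ids tensor.
setattr(backbone, "dummy_inputs", {"input_ids": torch.tensor([[0, 0]])})

# Attach an adapter to the "proj" submodule; the pseudo data is passed through the
# backbone once so the adapter can infer its input/output dimensionality.
delta_config = AutoDeltaConfig.from_dict({"delta_type": "adapter", "bottleneck_dim": 8,
                                          "modified_modules": ["proj"]})
delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=backbone)
delta_model.log()
```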
@@ -498,7 +534,7 @@ class DeltaBase(nn.Module, SaveLoadMixin): """ raise NotImplementedError - def insert_sequential_module(self, module, delta_module=None, delta_name='delta', strict=False, _delta_info=None): + def insert_module(self, module, method='sequential', delta_module=None, delta_name='delta', strict=False, _delta_info=None): r"""insert a module (previous not exists in the code base) before/after a module. Specifically, it modifies the forward function of the original module to firstly pass the arguments into the new module's forward function and then pass it into the original ones. The new module can also be inserted after the original module with similar mechanism. @@ -514,17 +550,6 @@ class DeltaBase(nn.Module, SaveLoadMixin): original delta is passed through ``_delta_info``. """ - def _caller(_org_func, org_module, delta_name, *args, **kwargs): - args = args[1:] # the first argument here is ``self`` - delta_module = getattr(org_module, delta_name) - if hasattr(delta_module, "pre_forward"):# is not None: - args, kwargs = delta_module.pre_forward(*args, **kwargs) - # from IPython import embed - # embed(header = "true") - ret = _org_func(*args, **kwargs) - if hasattr(delta_module, "post_forward"):# is not None: - ret = delta_module.post_forward(ret) - return ret if strict: @@ -535,9 +560,9 @@ class DeltaBase(nn.Module, SaveLoadMixin): if _delta_info is None: if delta_module is None: raise RuntimeError("delta module can't be none to ensure successful replicate of the parent module.") - - _delta_info = {"method": "insert_sequential", - "delta_module": delta_module, + + _delta_info = {"method": method, + "delta_module": delta_module, "delta_name": delta_name, "delta_belong": self, "state": "on"} @@ -549,12 +574,36 @@ class DeltaBase(nn.Module, SaveLoadMixin): setattr(module, _delta_info['delta_name'], _delta_info["delta_module"]) - new_forward = decorate(module.forward, _caller, extras=(module, _delta_info['delta_name']), kwsyntax=True) # decorator.decorate helps preserving the functions metadata (signature, etc.). - module.forward = new_forward.__get__(module, type(module)) # func.__get__(object, type(object)) register a function as an object's method - # for DataParallel's copy behavior. Experimental: - # may have bugs when module.forward is nestedly wrapped. - module._replicate_for_data_parallel = new_replicate_for_data_parallel.__get__(module, type(module)) + if _delta_info["method"] in caller_map.keys(): + caller = caller_map[_delta_info["method"]] + new_forward = decorate(module.forward, caller, extras=(module, _delta_info['delta_name']), kwsyntax=True) # decorator.decorate helps preserving the functions metadata (signature, etc.). + module.forward = new_forward.__get__(module, type(module)) # func.__get__(object, type(object)) register a function as an object's method + # for DataParallel's copy behavior. Experimental: + # may have bugs when module.forward is nestedly wrapped. + module._replicate_for_data_parallel = new_replicate_for_data_parallel.__get__(module, type(module)) + else: + raise NotImplementedError(f"_delta_info['method']=='{_delta_info['method']}' is not supported") + + + def insert_sequential_module(self, module, delta_module=None, delta_name='delta', strict=False, _delta_info=None): + r"""insert a module (previous not exists in the code base) before/after a module. Specifically, it modifies the forward + function of the original module to firstly pass the arguments into the new module's forward function and then pass + it into the original ones. 
The new module can also be inserted after the original module with similar mechanism. + + When implementing the new module , researchers should be aware of the components of arguments of the original module's forward function. + + Args: + module: (:obj:`nn.Module`): The (sub)module to inserted a delta module. + delta_module: (:obj:`DeltaBase`): The delta module to be inserted. + name: (:obj:`str`, *optional*): The name of the delta in the backbone module. + strict: (:obj:`bool`, *optional*): Whether to prohibit modify a modified module. + _delta_info (:obj:`Dict`, *optional*): Used in attach(), reattach a delta module to backbone. The info of + original delta is passed through ``_delta_info``. + + """ + self.insert_module(module, "sequential", delta_module, delta_name, strict, _delta_info) + def insert_parallel_module(self, module, delta_module=None, delta_name='delta', strict=False, _delta_info=None): """insert a module (previous not exists in the code base) across a module. Specifically, it modifies the forward @@ -573,41 +622,8 @@ class DeltaBase(nn.Module, SaveLoadMixin): """ - def _caller(_org_func, org_module, delta_name, *args, **kwargs): - args = args[1:] # the first argument here is ``self`` - delta_module = getattr(org_module, delta_name) - ret_1 = _org_func(*args, **kwargs) - ret_2 = delta_module.forward(*args, **kwargs) - return ret_1 + ret_2 - - if strict: - if hasattr(module.forward, "__wrapped__"): - raise RuntimeWarning("The forward function might have been wrapped by a decorator, is it intended?") - - # record info for plug and unplug and nested wrap - if _delta_info is None: - if delta_module is None: - raise RuntimeError("delta module can't be none to ensure successful replicate of the parent module.") - - _delta_info = {"method": "insert_parallel", - "delta_module": delta_module, - "delta_name": delta_name, - "delta_belong": self, - "state": "on"} - self._register_delta_infos(parent_module=module, - _delta_info = _delta_info) - else: - delta_module = _delta_info["delta_module"] - delta_name = _delta_info["delta_name"] - - setattr(module, _delta_info['delta_name'], _delta_info["delta_module"]) - - new_forward = decorate(module.forward, _caller, extras=(module, _delta_info['delta_name']), kwsyntax=True) # decorator.decorate helps preserving the functions metadata (signature, etc.). - module.forward = new_forward.__get__(module, type(module)) # func.__get__(object, type(object)) register a function as an object's method - # for DataParallel's copy behavior. Experimental: - # may have bugs when module.forward is nestedly wrapped. - module._replicate_for_data_parallel = new_replicate_for_data_parallel.__get__(module, type(module)) - + self.insert_module(module, "parallel", delta_module, delta_name, strict, _delta_info) + def set_active_state_dict(self, module: nn.Module): r"""modify the state_dict function of the model (by default, the backbone model) to return only the tunable part. @@ -623,8 +639,6 @@ class DeltaBase(nn.Module, SaveLoadMixin): state_dict.pop(n) return state_dict includes = self.trainable_parameters_names(module) # use excludes will have trouble when the model have shared weights - # print(includes, "grad:",self.backbone_model.plm.lm_head.weight.requires_grad) - # exit() if hasattr(module.state_dict, "__wrapped__"): raise RuntimeWarning("The forward function might have been wrapped by a decorator, is it intended? 
Do you freeze the parameters twice?") module.state_dict = decorate(module.state_dict, _caller, extras=(includes,), kwsyntax=True) # decorator.decorate helps preserving the functions metadata (signature, etc.). @@ -669,21 +683,46 @@ class DeltaBase(nn.Module, SaveLoadMixin): if visualization: from opendelta import Visualization Visualization(module).structure_graph() + + self.get_statistics(module) if trainable_ratio: - n_trainable = self.num_trainable_parameters(module) - n_total = self.num_total_parameters(module) - logger.info("Trainable Ratio: {:2f}%".format(n_trainable/n_total*100)) + logger.info("Trainable Ratio: {:2f}%".format(self.stat['trainable_ratio']*100)) if delta_ratio: - n_delta = self.num_delta_parameters(module) - n_total = self.num_total_parameters(module) - logger.info("Delta Parameter Ratio: {:2f}%".format(n_delta/n_total*100)) + logger.info("Delta Parameter Ratio: {:2f}%".format(self.stat['delta_ratio']*100)) if cuda_memory: - cudamem = 0 - maxcudamem = 0 - for device_id in range(torch.cuda.device_count()): - cudamem += torch.cuda.memory_allocated(f"cuda:{device_id}")/1024**3 - maxcudamem += torch.cuda.max_memory_allocated(f"cuda:{device_id}")/1024**3 - logger.info("Static Memory {:.2f} GB, Max Memory {:.2f} GB".format(cudamem, maxcudamem)) + logger.info("Static Memory {:.2f} GB, Max Memory {:.2f} GB".format(self.stat['cudamem'], self.stat['maxcudamem'])) + + + def get_statistics(self, module=None): + r"""Get the statistics of the parameters in the delta modules. + + Args: + module (:obj:`nn.Module`, *optional*): The module to compute the statistics. + + Returns: + :obj:`dict`: The statistics of the parameters in the delta modules. + + """ + if module is None: + module = self.backbone_model + + self.stat = {} + n_trainable = self.num_trainable_parameters(module) + n_total = self.num_total_parameters(module) + + self.stat['trainable_ratio'] = n_trainable/n_total + + n_delta = self.num_delta_parameters(module) + n_total = self.num_total_parameters(module) + self.stat['delta_ratio'] = n_delta/n_total + + cudamem = 0 + maxcudamem = 0 + for device_id in range(torch.cuda.device_count()): + cudamem += torch.cuda.memory_allocated(f"cuda:{device_id}")/1024**3 + maxcudamem += torch.cuda.max_memory_allocated(f"cuda:{device_id}")/1024**3 + self.stat['cudamem'] = cudamem + self.stat['maxcudamem'] = maxcudamem @@ -767,13 +806,7 @@ class DeltaBase(nn.Module, SaveLoadMixin): if _delta_info['method'] == "replace": setattr(submodule, _delta_info["child_name"], _delta_info['org_module']) - elif _delta_info['method'] == "insert_sequential": - if hasattr(submodule.forward, "__wrapped__"): - submodule.forward = submodule.forward.__wrapped__ - delattr(submodule, _delta_info["delta_name"]) - else: - raise AttributeError("submodule {}'s forward has no attribute __wrapped__. 
It's not a wrapped function.".format(name)) - elif _delta_info['method'] == "insert_parallel": + elif _delta_info['method'] in ["sequential", "before", "after", "parallel"]: if hasattr(submodule.forward, "__wrapped__"): submodule.forward = submodule.forward.__wrapped__ delattr(submodule, _delta_info["delta_name"]) diff --git a/opendelta/delta_configs.py b/opendelta/delta_configs.py index 5e789ef..ca722e4 100644 --- a/opendelta/delta_configs.py +++ b/opendelta/delta_configs.py @@ -5,15 +5,6 @@ from opendelta import __version__ as opendelta_version from opendelta.utils import logging from opendelta.utils.signature import get_arg_names, get_arg_names_inside_func import transformers -from transformers.file_utils import ( - PushToHubMixin, - is_offline_mode, - cached_path, - is_remote_url, - get_list_of_files, - hf_bucket_url, -) -from packaging import version import json import copy @@ -26,57 +17,44 @@ logger = logging.get_logger(__name__) FULL_CONFIGURATION_FILE = "config.json" _re_configuration_file = re.compile(r"config\.(.*)\.json") -class BaseDeltaConfig(PushToHubMixin): +class BaseDeltaConfig: r"""Base class for all configuration classes. Handles a few parameters common to all delta models' configurations as well as methods for loading/downloading/saving configurations. Class attributes (overridden by derived classes): - - **delta_type** (:obj:`str`) -- the name of the delta modules, used to create the correct :py:class:`~opendelta.AutoConfig`. + - **delta_type** (:obj:`str`) -- the name of the delta modules, used to create the correct :py:class:`~opendelta.AutoConfig`. Args: - modified_modules (:obj:`List[str]`, *optional*, defaults to :obj:``None``) + modified_modules (:obj:`List[str]`, *optional*, defaults to :obj:`None`) The list of keys to determine which modules you want to modify. OpenDelta will take every modulees that **ends with** the one of the provided keys as the modification target. When not given any value, i.e. ``modified_modules=None``, the delta module will use the it corresponding default modification modules. Taking DistilBertModel with an classifier on top as an example: .. note:: - **Examples**: When adding delta to DistilBertModel, - - 1. set to ``["0.attention.out_lin"]`` will add delta modules to the attention output of distilbert's - ayer 0, i.e., ``distilbert.transformer.layer.0.attention.out_lin``. + **Examples**: When adding delta to `DistilBertModel `_, + + 1. set to ``["0.attention.out_lin"]`` will add delta modules to the attention output of distilbert's layer 0, i.e., ``distilbert.transformer.layer.0.attention.out_lin``. 2. set to ``["attention.out_lin"]`` will add the delta modules in every layer's ``attention.out_lin``. - unfrozen_modules (:obj:`List[str]`, *optional*, defaults to :obj:`["deltas"]` ) - exclude_modules (:obj:`str`, *optional*, default to :obj:`None`): The modules starts with these strings will - be excluded in modification. Note that currently only plain text (no regular expression) is supported. + unfrozen_modules (:obj:`List[str]`, *optional*, defaults to :obj:`["deltas"]` ): The modules that are unfrozen + during training in :meth:`~opendelta.basemodel.DeltaBase.freeze_module`, which includes the ones that are newly introduced as delta modules, and the ones that are originally a part of the model but set to trainable (:obj:`requires_grad=True`) to train together with the delta modules. Opendelta will take every modules that **ends with** the one of the provided keys and all its sub-modules and paramters as trainable. 
- The modules that are unfrozen - during training. Including the ones that are newly introduced as delta modules, and the ones that are - originally a part of the model but set to trainable (:obj:`requires_grad=True`) to train together with the - delta modules. OpenDelta will take every modules that **ends with** the one of the provided keys and all - its sub-modules and paramters as trainable. + exclude_modules (:obj:`str`, *optional*, default to :obj:`None`): The modules starts with these strings will be excluded in modification. Note that currently only plain text (no regular expression) is supported. .. note:: + **Examples**: When adding delta to DistilBertModel, - + 1. set this argument to ``["bias"]`` will make all bias terms tunable. - 2. set this argument to ``["attention"]`` will make all parameters in all attention modules tunable. - - 3. set this argument to ``["deltas"]`` will make all the parameters in the newly introduced delta - modules tunable. - + 3. set this argument to ``["deltas"]`` will make all the parameters in the newly introduced delta modules tunable. 4. set this argument to ``["classifier"]`` will make all parameters in the classifier tunable. + 5. set this argument to ``["3.ffn.lin2", "deltas", "classifier"]``, will make all parameters in the third layer's feed forward layer's send linear layer, the detla modules, and the classifiers modules tunable. - 5. set this argument to ``["3.ffn.lin2", "deltas", "classifier"]``, will make all parameters in - the third layer's feed forward layer's send linear layer, the detla modules, and the classifiers modules - tunable. - - common_structure (:obj:`bool`, *optional*, default to :obj:`None`): Whether using the common structure mapping of - the transformer model when designating :obj:`modified_modules` and :obj:`unfrozen_modules`. + common_structure (:obj:`bool`, *optional*, default to :obj:`None`): Whether using the common structure mapping of the transformer model when designating ``modified_modules` and ``unfrozen_modules``. backbone_class (:obj:`str`, *optional*, default to :obj:`None`): The name of backbone model's class, e.g. ``RobertaForMaskedLM``. Saving this infomation let the users explicitly know on which backbone the delta model is trained. @@ -108,20 +86,20 @@ class BaseDeltaConfig(PushToHubMixin): @classmethod - def from_finetuned(cls, finetuned_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "BaseDeltaConfig": + def from_finetuned(cls, finetuned_delta_path: Union[str, os.PathLike], **kwargs) -> "BaseDeltaConfig": r""" Instantiate a :obj:`BaseDeltaConfig` (or a derived class) from a finetined delta module configuration. Args: finetuned_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): This can be either: - * a string, the *model id* of a finetuned delta model configuration hosted inside a model repo on + - a string, the *model id* of a finetuned delta model configuration hosted inside a model repo on deltahub.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``. - - * a path to a *directory* containing a configuration file saved using the :meth:`BaseDeltaConfig.save_finetuned` method, e.g., ``./my_model_directory/``. - - * a path or url to a saved configuration JSON *file*, e.g., ``./my_model_directory/configuration.json``. + + - a path to a *directory* containing a configuration file saved using the :meth:`BaseDeltaConfig.save_finetuned` method, e.g., ``./my_model_directory/``. 
+ + - a path or url to a saved configuration JSON *file*, e.g., ``./my_model_directory/configuration.json``. cache_dir (:obj:`str` or :obj:`os.PathLike`, *optional*): Path to a directory in which a downloaded pretrained delta model configuration should be cached if the @@ -129,10 +107,10 @@ class BaseDeltaConfig(PushToHubMixin): .. code-block:: python - delta_config = LoraConfig.from_finetuned("DeltaHub/lora_t5-base_mrpc") + delta_config = AdapterConfig.from_finetuned("thunlp/FactQA_T5-large_Adapter", backbone_model=t5) """ - config_dict, kwargs = cls.get_config_dict(finetuned_model_name_or_path, **kwargs) + config_dict, kwargs = cls.get_config_dict(finetuned_delta_path, **kwargs) if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warn( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " @@ -141,7 +119,7 @@ class BaseDeltaConfig(PushToHubMixin): return cls.from_dict(config_dict, **kwargs) - def save_finetuned(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): + def save_finetuned(self, save_directory: Union[str, os.PathLike], **kwargs): """ Save a configuration object to the directory :obj:`save_directory`, so that it can be re-loaded using the :meth:`BaseDeltaConfig.from_finetuned` class method. @@ -153,22 +131,15 @@ class BaseDeltaConfig(PushToHubMixin): the Hugging Face model hub after saving it. .. warning:: - 1. Will raise error if you haven't config a Huggingface Model Hub. - 2. Using ``push_to_hub=True`` will synchronize the repository you are pushing to with ``save_directory``, - which requires ``save_directory`` to be a local clone of the repo you are pushing to if it's an existing - folder. Pass along ``temp_dir=True`` to use a temporary directory instead. - kwargs: - Additional key word arguments passed along to the - `PushToHubMixin.push_to_hub `_ method. + 1. Will raise error if you haven't config a Huggingface Model Hub. + 2. Using ``push_to_hub=True`` will synchronize the repository you are pushing to with ``save_directory``, which requires ``save_directory`` to be a local clone of the repo you are pushing to if it's an existing folder. Pass along ``temp_dir=True`` to use a temporary directory instead. + + kwargs: Additional key word arguments. 
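A sketch of the save/load round trip these two methods are intended to support (the directory name is illustrative):

```python
# Assumed round trip for a delta configuration object.
from opendelta.delta_models.adapter import AdapterConfig

cfg = AdapterConfig(bottleneck_dim=24, non_linearity="gelu_new")
cfg.save_finetuned("./my_delta_config/")                   # writes config.json into the directory
cfg2 = AdapterConfig.from_finetuned("./my_delta_config/")  # reloads it from the local path
```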
""" if os.path.isfile(save_directory): raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") - if push_to_hub: - commit_message = kwargs.pop("commit_message", None) - repo = self._create_or_get_repo(save_directory, **kwargs) - os.makedirs(save_directory, exist_ok=True) # If we save using the predefined names, we can load using `from_pretrained` output_config_file = os.path.join(save_directory, CONFIG_NAME) @@ -176,9 +147,6 @@ class BaseDeltaConfig(PushToHubMixin): self.to_json_file(output_config_file, use_diff=True) logger.info(f"Configuration saved in {output_config_file}") - if push_to_hub: - url = self._push_to_hub(repo, commit_message=commit_message) - logger.info(f"Configuration pushed to the hub in this commit: {url}") @classmethod def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "BaseDeltaConfig": @@ -202,6 +170,7 @@ class BaseDeltaConfig(PushToHubMixin): config_dict.pop(config_key) unused_config_keys.append(config_key) logger.warning(f"The following keys are not used by {cls}.__init__ function: {unused_config_keys}") + config = cls(**config_dict) @@ -215,7 +184,7 @@ class BaseDeltaConfig(PushToHubMixin): to_remove.append(key) for key in to_remove: kwargs.pop(key, None) - logger.info(f"Model config {config}") + logger.info(f"Model config\n{config}") if return_unused_kwargs: return config, kwargs @@ -224,96 +193,58 @@ class BaseDeltaConfig(PushToHubMixin): @classmethod def get_config_dict( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + cls, finetuned_delta_path: Union[str, os.PathLike], **kwargs ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """[NODOC] - From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a + From a ``finetuned_delta_path``, resolve to a dictionary of parameters, to be used for instantiating a [``PretrainedConfig``] using ``from_dict``. Parameters: - pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): + finetuned_delta_path (:obj:`str` or :obj:`os.PathLike`): The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. Returns: :obj:`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object. 
""" - cache_dir = kwargs.pop("cache_dir", None) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - use_auth_token = kwargs.pop("use_auth_token", None) - local_files_only = kwargs.pop("local_files_only", False) - revision = kwargs.pop("revision", None) + cache_dir = kwargs.get("cache_dir", None) + force_download = kwargs.get("force_download", False) + # resume_download = kwargs.pop("resume_download", False) + # proxies = kwargs.pop("proxies", None) + # use_auth_token = kwargs.pop("use_auth_token", None) + local_files_only = kwargs.get("local_files_only", False) + # revision = kwargs.pop("revision", None) # from_pipeline = kwargs.pop("_from_pipeline", None) - from_auto_class = kwargs.pop("_from_auto", False) + # from_auto_class = kwargs.pop("_from_auto", False) - user_agent = {"file_type": "config", "from_auto_class": from_auto_class} + # user_agent = {"file_type": "config", "from_auto_class": from_auto_class} # if from_pipeline is not None: # user_agent["using_pipeline"] = from_pipeline - if is_offline_mode() and not local_files_only: - logger.info("Offline mode: forcing local_files_only=True") + if os.environ.get("DELTACENTER_OFFLINE", '0') == '1': + logger.info("Delta Center offline mode!") local_files_only = True - pretrained_model_name_or_path = str(pretrained_model_name_or_path) - if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): - config_file = pretrained_model_name_or_path + finetuned_delta_path = str(finetuned_delta_path) + + if cache_dir is not None: + cached_finetuned_delta_path = os.path.join(cache_dir, finetuned_delta_path) else: - configuration_file = get_configuration_file( - pretrained_model_name_or_path, - revision=revision, - use_auth_token=use_auth_token, - local_files_only=local_files_only, - ) + cached_finetuned_delta_path = finetuned_delta_path + if os.path.isfile(cached_finetuned_delta_path): + local_files_only = True + elif os.path.isdir(cached_finetuned_delta_path): + # cached_finetuned_delta_path = os.path.join(cached_finetuned_delta_path, 'config.json') + local_files_only = True - if os.path.isdir(pretrained_model_name_or_path): - config_file = os.path.join(pretrained_model_name_or_path, configuration_file) - else: - config_file = hf_bucket_url( - pretrained_model_name_or_path, filename=configuration_file, revision=revision, mirror=None - ) - - try: - # Load from URL or cache if already cached - resolved_config_file = cached_path( - config_file, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - user_agent=user_agent, - ) - # Load config dict - config_dict = cls._dict_from_json_file(resolved_config_file) - - except EnvironmentError as err: - logger.error(err) - msg = ( - f"Can't load config for '{pretrained_model_name_or_path}'. 
Make sure that:\n\n" - f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n" - f" (make sure '{pretrained_model_name_or_path}' is not a path to a local directory with something else, in that case)\n\n" - f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {CONFIG_NAME} file\n\n" - ) - - if revision is not None: - msg += f"- or '{revision}' is a valid git identifier (branch name, a tag name, or a commit id) that exists for this model name as listed on its model page on 'https://huggingface.co/models'\n\n" - - raise EnvironmentError(msg) - - except (json.JSONDecodeError, UnicodeDecodeError): - msg = ( - f"Couldn't reach server at '{config_file}' to download configuration file or " - "configuration file is not a valid JSON file. " - f"Please check network or file content here: {resolved_config_file}." - ) - raise EnvironmentError(msg) - - if resolved_config_file == config_file: - logger.info(f"loading configuration file {config_file}") - else: - logger.info(f"loading configuration file {config_file} from cache at {resolved_config_file}") + # if local_files_only: + # config_dict = cls._dict_from_json_file(cached_finetuned_delta_path) + if not local_files_only or force_download: + from .utils.delta_center import download as dcdownload + # try to download from DeltaCenter + cached_finetuned_delta_path = dcdownload(finetuned_delta_path, force_download=force_download, cache_dir=cache_dir) + kwargs['force_download'] = False # Has been downloaded, not more forcing + cached_finetuned_delta_path = os.path.join(cached_finetuned_delta_path, 'config.json') + config_dict = cls._dict_from_json_file(cached_finetuned_delta_path) return config_dict, kwargs @classmethod @@ -400,8 +331,6 @@ class BaseDeltaConfig(PushToHubMixin): def to_dict(self) -> Dict[str, Any]: """ Serializes this instance to a Python dictionary. - Returns: - :obj:`dict`: Dictionary of all the attributes that make up this configuration instance. """ output = copy.deepcopy(self.__dict__) if hasattr(self.__class__, "model_type"): @@ -427,53 +356,6 @@ class BaseDeltaConfig(PushToHubMixin): -def get_configuration_file( - path_or_repo: Union[str, os.PathLike], - revision: Optional[str] = None, - use_auth_token: Optional[Union[bool, str]] = None, - local_files_only: bool = False, -) -> str: - """ - Get the configuration file to use for this version of transformers. - Args: - path_or_repo (`:obj:str` or `:obj:os.PathLike`): - Can be either the id of a repo on huggingface.co or a path to a *directory*. - revision(`:obj:str`, *optional*, defaults to ``"main"``): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any - identifier allowed by git. - use_auth_token (:obj:`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token generated - when running ``transformers-cli login`` (stored in ``~/.huggingface``). - local_files_only (:obj:`bool`, *optional*, defaults to :obj:`False`): - Whether or not to only rely on local files and not to attempt to download any files. - Returns: - :obj:`str`: The configuration file to use. - """ - # Inspect all files from the repo/folder. 
- all_files = get_list_of_files( - path_or_repo, revision=revision, use_auth_token=use_auth_token, local_files_only=local_files_only - ) - configuration_files_map = {} - for file_name in all_files: - search = _re_configuration_file.search(file_name) - if search is not None: - v = search.groups()[0] - configuration_files_map[v] = os.path.split(file_name)[-1] - available_versions = sorted(configuration_files_map.keys()) - # Defaults to FULL_CONFIGURATION_FILE and then try to look at some newer versions. - configuration_file = FULL_CONFIGURATION_FILE - # transformers_version_ = version.parse(transformers_version) - for v in available_versions: - # if version.parse(v) <= transformers_version_: - configuration_file = configuration_files_map[v] - # else: - # # No point going further since the versions are sorted. - # break - - return configuration_file - - if __name__ == "__main__": myconfig = BaseDeltaConfig.from_pretrained("../ckpts/lora/") myconfig.save_pretrained("../ckpts/lora.1/") diff --git a/opendelta/delta_models/adapter.py b/opendelta/delta_models/adapter.py index 4202c5a..bbb8bb4 100644 --- a/opendelta/delta_models/adapter.py +++ b/opendelta/delta_models/adapter.py @@ -11,6 +11,8 @@ from opendelta import BaseDeltaConfig import opendelta.utils.logging as logging import numpy as np from opendelta import global_setting +from dataclasses import dataclass, field + logger = logging.get_logger(__name__) @@ -20,10 +22,18 @@ class InterFaceMixin: self._reverse_axis_order = np.argsort(self._axis_order).tolist() def _transpose(self, tensor): - return tensor.permute(*self._axis_order) + if tensor.dim() == 3: + return tensor.permute(*self._axis_order) + else: + return tensor + + def _reverse_transpose(self, tensor): - return tensor.permute(*self._reverse_axis_order).contiguous() + if tensor.dim() == 3: + return tensor.permute(*self._reverse_axis_order).contiguous() + else: + return tensor def _convert_data_type(self, tensor): self._data_type_record = tensor.dtype @@ -35,6 +45,8 @@ class InterFaceMixin: + + class AdapterLayer(nn.Module, InterFaceMixin): r"""A layer of adapter tuning module. """ @@ -144,7 +156,6 @@ class AdapterConfig(BaseDeltaConfig): self, bottleneck_dim: Optional[int]=24, non_linearity: Optional[str]='gelu_new', - sequential: Optional[str] = True, **kwargs ): super().__init__(**kwargs) @@ -175,27 +186,21 @@ class AdapterModel(DeltaBase): backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. bottleneck_dim (:obj:`int`): The dimension of the adapter's bottleneck. non_linearity (:obj:`str`): The non linearity of the adapter. - sequential (:obj:`str`): Whether insert the adapter in a sequential manner, as opposed to a parallel manner. - See `Towards a Unified View of Parameter-Efficient Transfer Learning `_ - for detail. - modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only - the implemented ones) - unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen - together with the prefix parameters. - common_structure (:obj:`bool`): whether using name-based addressing with a common structure mapping. + modified_modules (:obj:`List[str]`): modules to add adapter after them. + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen together with the adapter parameters. + common_structure (:obj:`bool`): whether using name-based addressing witha common structure mapping. 
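A usage sketch matching the arguments documented above (the T5 checkpoint is illustrative):

```python
# Attach adapters at the default positions, freeze the backbone, and inspect the result.
from transformers import AutoModelForSeq2SeqLM
from opendelta.delta_models.adapter import AdapterModel

t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
delta = AdapterModel(backbone_model=t5, bottleneck_dim=24, non_linearity="gelu_new")
delta.freeze_module()
delta.log(visualization=False, trainable_ratio=True, delta_ratio=True, cuda_memory=False)
```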
""" config_class = AdapterConfig delta_type = "adapter" - default_modified_modules = ["attn", "ff"] + default_modified_modules = ["attn@.proj@", "ff@.w2@"] + _need_pseudo_data = True def __init__(self, backbone_model: nn.Module, bottleneck_dim: Optional[int]=24, non_linearity: Optional[str]='gelu_new', - sequential: Optional[str] = True, - modified_modules: Optional[List[str]] = None, - exclude_modules: Optional[List[str]] = None, - unfrozen_modules: Optional[List[str]] = None, + modified_modules: Optional[bool] = None, + unfrozen_modules: Optional[bool] = None, common_structure: Optional[bool] = None, interactive_modify: Optional[Union[bool, int]] = False, ): @@ -217,19 +222,8 @@ class AdapterModel(DeltaBase): self.add_all_delta_to_backbone(self.backbone_model, self.modified_modules, ) - - - def add_all_delta_to_backbone(self, - module: nn.Module, - modified_modules: List[str], - ) -> nn.Module: - for key, _ in module.named_modules(): - if self.find_key(key, modified_modules): - self.update_module(module, key) - self._pseudo_data_to_instantiate(module) - self.mark_as_delta() - return module - + + def update_module(self, module: nn.Module, key: str): _, _, ref = self.find_module(module, key) adapterlayer = self.new_module_like(ref) diff --git a/opendelta/delta_models/bitfit.py b/opendelta/delta_models/bitfit.py index 29c9194..8d26997 100644 --- a/opendelta/delta_models/bitfit.py +++ b/opendelta/delta_models/bitfit.py @@ -75,6 +75,16 @@ class BiasLayer(nn.Module): raise TypeError return output +framework_map = {} +framework_map['hf'] = { + "linear": nn.Linear, + "layer_norm": nn.LayerNorm, +} + +framework_map['bmt'] = { + "linear": model_center.layer.Linear, + "layer_norm", model_center.layer.LayerNorm, +} class BitFitModel(DeltaBase): @@ -113,7 +123,8 @@ class BitFitModel(DeltaBase): config_class = BitFitConfig delta_type = "bitfit" - default_modified_modules = ["attn", "ff", "layer_norm","lm_head.proj"] # modify all the bias parameter in attention and feed-forward layer. + default_modified_modules = ["attn@", "ff@", "layer_norm@","lm_head@.proj@"] # modify all the bias parameter in attention and feed-forward layer. + _need_pseudo_data = False def __init__(self, backbone_model: nn.Module, modified_modules: Optional[List[str]] = None, @@ -121,6 +132,7 @@ class BitFitModel(DeltaBase): unfrozen_modules: Optional[List[str]] = None, common_structure: Optional[bool] = None, interactive_modify: Optional[Union[bool, int]] = False, + framework_type: Optional[str] = "hf", ): DeltaBase.__init__(self, backbone_model, @@ -129,6 +141,7 @@ class BitFitModel(DeltaBase): unfrozen_modules=unfrozen_modules, common_structure=common_structure, interactive_modify=interactive_modify, + framework_type=framework_type, ) arg_names = get_arg_names_inside_func(self.__init__) for arg_name in arg_names: @@ -139,8 +152,7 @@ class BitFitModel(DeltaBase): self.delta_modules = nn.ModuleList() self.add_all_delta_to_backbone(self.backbone_model, - self.modified_modules, - ) + self.modified_modules) def update_module(self, module: nn.Module, key: str): @@ -153,38 +165,58 @@ class BitFitModel(DeltaBase): ): if is_leaf_module(module): # if it is a leaf module, add bias to it regardless of its type. - if self.check_linear(module): - self.add_bias_to_linear(module) + # if self.check_linear(module): + # self.add_bias_to_linear(module) + if self.check_linear(module) or self.check_layernorm(module, nn.LayerNorm): + self.add_bias_to_modules_have_bias_or_known_type(module) else: # for example, layer_norms, lm_heads. 
self.add_bias_to_others(module) else: - # for the non-leaf modules, by default it will add bias only to the linear submodules. for n, c in module.named_modules(): - if self.check_linear(c): - self.add_bias_to_linear(c) - else: - pass + self.add_bias_to_modules_have_bias_or_known_type(c) + # if self.check_linear(c): + # self.add_bias_to_linear(c) + # else: + # pass - def add_bias_to_linear(self, c): - if c.bias is None: - bias = nn.Parameter(torch.empty(c.out_features), requires_grad=True) - self._reset_bias_parameters(c) - try: - import bmtrain as bmt - bias = bmt.BMTrainModelWrapper(bias) - except: - pass - c.register_parameter('bias', bias) - self.delta_params.append(bias) - else: + # def add_bias_to_linear(self, c): + # if c.bias is None: + # bias = nn.Parameter(torch.empty(c.out_features), requires_grad=True) + # self._reset_bias_parameters(c) + # try: + # import bmtrain as bmt + # bias = bmt.BMTrainModelWrapper(bias) + # except: + # pass + # c.register_parameter('bias', bias) + # self.delta_params.append(bias) + # else: + # self.add_bias_to_modules_have_bias_or_known_type(c) + + def add_bias_to_modules_have_bias_or_known_type(self, c): + '''If it has bias, unfreeze it. + If it doesn't have bias: if it is Linear of LN, add to it, else pass. + ''' + if 'bias' in [n for n,p in c.named_parameters()]: c.bias.requires_grad = True self.delta_params.append(c.bias) + else: + if self.check_linear(c) or isinstance(c): # todo: bmt layerNorm + bias = nn.Parameter(torch.empty(c.out_features), requires_grad=True) + + self._reset_bias_parameters(c) #? + try: + import bmtrain as bmt + bias = bmt.BMTrainModelWrapper(bias) + except: + pass + c.register_parameter('bias', bias) + self.delta_params.append(bias) - def add_bias_to_others(self, c): + def add_bias_to_others(self, c): # todo: bmtrain? new_bias = BiasLayer(dtype=get_dtype(c), device=get_device(c)) - self.insert_sequential_module(c, delta_module=new_bias, delta_name="bitfit") # name shouldn't be `bias` here, since - # the name `bias` is reserved for some module such as roberta's LayerNorm. + self.insert_sequential_module(c, delta_module=new_bias, delta_name="bitfit") # name shouldn't be `bias` here, since the name `bias` is reserved for some module such as roberta's LayerNorm. 
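A usage sketch for BitFit with the new `framework_type` switch ("hf" for transformers backbones, "bmt" for model_center/BMTrain ones); the RoBERTa checkpoint is illustrative:

```python
from transformers import AutoModelForSequenceClassification
from opendelta.delta_models.bitfit import BitFitModel

backbone = AutoModelForSequenceClassification.from_pretrained("roberta-base")
delta = BitFitModel(backbone_model=backbone, framework_type="hf")
delta.freeze_module()   # keep only the parameters registered as deltas (bias terms, BiasLayers) trainable
delta.log()
```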
self.delta_modules.append(new_bias) def check_linear(self, m): @@ -198,6 +230,18 @@ class BitFitModel(DeltaBase): except: pass return False + + def check_layernorm(self, m): + if isinstance(m, nn.LayerNorm): + return True + else: + try: + from model_center.layer import LayerNorm + if isinstance(m, LayerNorm): + return True + except: + pass + return False @staticmethod diff --git a/opendelta/delta_models/compacter.py b/opendelta/delta_models/compacter.py index 86e2799..ca88bf2 100644 --- a/opendelta/delta_models/compacter.py +++ b/opendelta/delta_models/compacter.py @@ -217,7 +217,8 @@ class CompacterModel(DeltaBase): """ config_class = CompacterConfig delta_type = "compacter" - default_modified_modules = ["attn", "ff"] + default_modified_modules = ["attn@.proj@", "ff@.w2@"] + _need_pseudo_data = True def __init__(self, backbone_model, modified_modules: Optional[List[str]] = None, @@ -267,16 +268,16 @@ class CompacterModel(DeltaBase): ) - def add_all_delta_to_backbone(self, - module: nn.Module, - modified_modules: List[str], - ) -> nn.Module: - for key, _ in module.named_modules(): - if self.find_key(key, modified_modules): - self.update_module(module, key) - self._pseudo_data_to_instantiate(module) - self.mark_as_delta() - return module + # def add_all_delta_to_backbone(self, + # module: nn.Module, + # modified_modules: List[str], + # ) -> nn.Module: + # for key, _ in module.named_modules(): + # if self.find_key(key, modified_modules): + # self.update_module(module, key) + # self._pseudo_data_to_instantiate(module) + # self.mark_as_delta() + # return module def update_module(self, module: nn.Module, key: str): _, _, ref = self.find_module(module, key) diff --git a/opendelta/delta_models/lora.py b/opendelta/delta_models/lora.py index 3fe9504..83bd65a 100644 --- a/opendelta/delta_models/lora.py +++ b/opendelta/delta_models/lora.py @@ -3,10 +3,10 @@ from typing import Optional, Union from opendelta.utils.signature import get_arg_names, get_arg_names_inside_func from opendelta.utils.name_based_addressing import * from opendelta.basemodel import DeltaBase -from transformers.models.t5 import T5ForConditionalGeneration import torch.nn as nn from opendelta import BaseDeltaConfig import math +from dataclasses import dataclass, field class LowRankLinear(nn.Module): # ------------------------------------------------------------------------------------------ @@ -40,6 +40,11 @@ class LowRankLinear(nn.Module): def forward(self, x): return (self.lora_dropout(x) @ self.lora_A.T @ self.lora_B.T) * self.scaling +@dataclass +class LoraArguments: + r: int = 8 + lora_alpha: int = 16 + lora_dropout: float = 0.0 class LoraConfig(BaseDeltaConfig): r""" @@ -65,16 +70,17 @@ class LoraModel(DeltaBase): Thanks for their `loralib `_. .. note:: + In our implementation, we did not use loralib.linear to replace the linear layer of the backbone model. Instead, we insert a parallel module into the backbone. - In other words, we treat :math:`(W + A^TB) X` as :math:`WX+ A^TBX`, and insert the :math:`A^TBX` as a parallel insertion module. - If you want to use the original implementation, please refer to `lora_old.py` + In other words, we treat :math:`(W + A^TB) X` as :math:`WX+ A^TBX`, and insert the :math:`A^TBX` as a parallel insertion module. If you want to use the original implementation, please refer to `lora_old.py` class attributes: - - default_modified_modules = ['attn.q', 'attn.v'] According to the paper, they modify q and v matrix in the - attention layer. 
However, other linears can also be modified, and may lead to better performance. + + - default_modified_modules = ['attn.q', 'attn.v'] According to the paper, they modify q and v matrix in the attention layer. However, other linears can also be modified, and may lead to better performance. .. note:: + modified_modules should point to linear layer. We currently don't support broadcast to all linears in a module's child modules. @@ -96,7 +102,8 @@ class LoraModel(DeltaBase): config_class = LoraConfig delta_type = "lora" - default_modified_modules = ['attn.q', 'attn.v'] + default_modified_modules = ['attn@.q@', 'attn@.v@'] + _need_pseudo_data = False def __init__(self, backbone_model: nn.Module, lora_r=8, diff --git a/opendelta/delta_models/low_rank_adapter.py b/opendelta/delta_models/low_rank_adapter.py index 210ade2..5946331 100644 --- a/opendelta/delta_models/low_rank_adapter.py +++ b/opendelta/delta_models/low_rank_adapter.py @@ -153,7 +153,8 @@ class LowRankAdapterModel(DeltaBase): config_class = LowRankAdapterConfig delta_type = "low_rank_adapter" - default_modified_modules = ['attn', 'ff'] + default_modified_modules = ["attn@.proj@", "ff@.w2@"] + _need_pseudo_data = True def __init__(self, backbone_model: nn.Module, reduction_factor = 32, @@ -186,16 +187,16 @@ class LowRankAdapterModel(DeltaBase): ) - def add_all_delta_to_backbone(self, - module: nn.Module, - modified_modules: List[str], - ) -> nn.Module: - for key, _ in module.named_modules(): - if self.find_key(key, modified_modules): - self.update_module(module, key) - self._pseudo_data_to_instantiate(module) - self.mark_as_delta() - return module + # def add_all_delta_to_backbone(self, + # module: nn.Module, + # modified_modules: List[str], + # ) -> nn.Module: + # for key, _ in module.named_modules(): + # if self.find_key(key, modified_modules): + # self.update_module(module, key) + # self._pseudo_data_to_instantiate(module) + # self.mark_as_delta() + # return module def update_module(self, module: nn.Module, key: str): _, _, ref = self.find_module(module, key) diff --git a/opendelta/delta_models/parallel_adapter.py b/opendelta/delta_models/parallel_adapter.py new file mode 100644 index 0000000..1024394 --- /dev/null +++ b/opendelta/delta_models/parallel_adapter.py @@ -0,0 +1,199 @@ +from functools import partial +from random import random +from typing import Optional, Union +from opendelta.utils.signature import get_arg_names_inside_func +from opendelta.utils.name_based_addressing import * +from opendelta.utils.cuda import get_device +from opendelta.basemodel import DeltaBase +import torch.nn as nn +import torch +from opendelta.delta_models.layers.activations import Activations +from opendelta import BaseDeltaConfig +import opendelta.utils.logging as logging +logger = logging.get_logger(__name__) + +class ParallelAdapterLayer(nn.Module): + r"""A layer of adapter tuning module. 
+ """ + layer_count = 0 + + @classmethod + def count_layer(cls): + cls.layer_count += 1 + + @classmethod + def get_layer_count(cls): + return cls.layer_count + + def __init__(self, bottleneck_dim=24, non_linearity='gelu_new', scaled=1, device=None): + super().__init__() + self.bottleneck_dim = bottleneck_dim + self.device = device + self.instantiated = False + self.non_linearity = non_linearity + self.scaled = scaled + + self.layer_id = ParallelAdapterLayer.get_layer_count() + ParallelAdapterLayer.count_layer() + + + def instantiate(self, hidden_dim): + self.modulelist = nn.Sequential() + self.modulelist.add_module("down_proj",nn.Linear(hidden_dim, self.bottleneck_dim, device=self.device)) + + # select non-linearity + self.modulelist.add_module("non_linear", Activations(self.non_linearity.lower())) + + self.modulelist.add_module("up_proj", nn.Linear(self.bottleneck_dim, self.hidden_dim, device=self.device)) + + self.instantiated = True + # initialize the weight, which is important for fast convergence and better performance. + self.apply(self._init_weight) + + def _init_weight(self, module): + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=0.01) + if module.bias is not None: + module.bias.data.zero_() + + + def pre_forward(self, *args, **kwargs): + r""" Get the hidden_states from the PLM's layer output, pass it into the adapter, + then combined with the main hidden_states. Finally pass it into the subsequent layer. + + """ + if isinstance(args, tuple): + hiddens = args[0] + elif isinstance(args, torch.Tensor): + hiddens = args + else: + raise TypeError + + + if not self.instantiated: + self.hidden_dim = hiddens.shape[-1] + logger.debug(f"Got hidden dim hidden_dim {self.hidden_dim}") + self.instantiate(hidden_dim=self.hidden_dim) + + + self.adapter_output = self.modulelist(hiddens) * self.scaled + return args, kwargs + + def post_forward(self, output, **kwargs): + if isinstance(output, tuple): + hidden = output[0] + elif isinstance(output, torch.Tensor): + hidden = output + else: + raise TypeError + + modified_output = self.adapter_output + hidden + if isinstance(output, tuple): + output = (modified_output,) + output[1:] + elif isinstance(output, torch.Tensor): + output = modified_output + else: + raise TypeError + return output + + + +class ParallelAdapterConfig(BaseDeltaConfig): + r""" + This is the configuration class to store the configuration of a :py:class:`~ParallelAdapterModel` + + """ + def __init__( + self, + bottleneck_dim: Optional[int]=24, + non_linearity: Optional[str]='gelu_new', + scaled: Optional[float]=1., + **kwargs + ): + super().__init__(**kwargs) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # the arg has not been registered in parent config + setattr(self, arg_name, locals()[arg_name]) + + + +class ParallelAdapterModel(DeltaBase): + r""" The implementation of Parallel Adapter(`TOWARDS A UNIFIED VIEW OF PARAMETER-EFFICIENT TRANSFER LEARNING `_ ) . + Add adapter to the designated ``modified_modules``. In parallel paradigm, The modules' output is then passed into the adapter's + post_forward. + + .. note:: + We **assume** the output of the modified module is the hidden state or a tuple where hidden state is the + first element. This is true for most PLMs. However, we admit that currently it's not rigorous, We will improve + it in the next version. Currently, if you encount an error here for you backbone, you can modify the code to + get the hidden state. 
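A usage sketch for the new parallel adapter (backbone and dimensions are illustrative assumptions):

```python
from transformers import AutoModelForSeq2SeqLM
from opendelta.delta_models.parallel_adapter import ParallelAdapterModel

t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
# The default modified_modules pair each entry point with an exit point (attn -> attn, ff.w1 -> ff.w2).
delta = ParallelAdapterModel(backbone_model=t5, bottleneck_dim=24, non_linearity="gelu_new")
delta.freeze_module()
delta.log()
```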
+ + class attributes: + - default_modified_modules = ["attn", "ff"] According to the Adapter paper, we add adapter to the attention layer + and feed forward layer. + - delta_type = "adapter" + + Args: + backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. + bottleneck_dim (:obj:`int`): The dimension of the adapter's bottleneck. + non_linearity (:obj:`str`): The non linearity of the adapter. + modified_modules (:obj:`List[str]`): modules to add parallel adapter. Must be paired and have the save order in layer. For examples, ["attn", "attn", "ff.w1", "ff.w2"] add one parallel adapter from attn's input to attn's output, and another one from ff.w1's input to ff.w2's output. + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen together with the parallel adapter parameters. + common_structure (:obj:`bool`): whether using name-based addressing witha common structure mapping. + + """ + config_class = ParallelAdapterConfig + delta_type = "parallel_adapter" + default_modified_modules = ["attn@", "attn@", "ff@.w1@", "ff@.w2@"] + # default_modified_modules = ["attn", "attn", "ff.w1", "ff.w2"] + _need_pseudo_data = True + def __init__(self, + backbone_model: nn.Module, + bottleneck_dim: Optional[int]=24, + non_linearity: Optional[str]='gelu_new', + modified_modules: Optional[bool] = None, + exclude_modules: Optional[List[str]] = None, + unfrozen_modules: Optional[bool] = None, + common_structure: Optional[bool] = None, + interactive_modify: Optional[Union[bool, int]] = False, + ): + DeltaBase.__init__(self, + backbone_model, + modified_modules=modified_modules, + exclude_modules=exclude_modules, + unfrozen_modules=unfrozen_modules, + common_structure=common_structure, + interactive_modify=interactive_modify, + ) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # not registered in parent class + setattr(self, arg_name, locals()[arg_name]) + + self.delta_modules = nn.ModuleList() + + self.ith = 0 + self.add_all_delta_to_backbone(self.backbone_model, + self.modified_modules, + ) + + + def update_module(self, module: nn.Module, key: str): + _, _, ref = self.find_module(module, key) + if self.ith % 2 == 0: + adapterlayer = self.new_module_like(ref) + self.insert_module(ref, "before", delta_module=adapterlayer, delta_name="parallel_adapter") + if self.ith % 2 == 1 or self.modified_modules[self.ith] == self.modified_modules[self.ith + 1]: + adapterlayer = self.delta_modules[-1] + self.insert_module(ref, "after", delta_module=adapterlayer, delta_name="parallel_adapter") + self.ith |= 1 + self.ith += 1 + self.ith %= len(self.modified_modules) + + def new_module_like(self, module): + module_device = get_device(module) + adapterlayer = ParallelAdapterLayer(bottleneck_dim=self.bottleneck_dim, non_linearity=self.non_linearity, device=module_device) + self.delta_modules.append(adapterlayer) + return adapterlayer + \ No newline at end of file diff --git a/opendelta/delta_models/prefix.py b/opendelta/delta_models/prefix.py index f40ea93..f64df2c 100644 --- a/opendelta/delta_models/prefix.py +++ b/opendelta/delta_models/prefix.py @@ -515,7 +515,8 @@ class PrefixModel(DeltaBase): """ config_class = PrefixConfig delta_type = "prefix" - default_modified_modules = ['attn'] + default_modified_modules = ['attn@'] + _need_pseudo_data = True def __init__(self, backbone_model: nn.Module, prefix_token_num=6, @@ -610,7 +611,7 @@ class PrefixModel(DeltaBase): 
module_device = get_device(module) prefixlayer = PrefixLayerBart(prefix_token_num=self.prefix_token_num, num_heads=module.num_heads ,device=module_device) else: - raise NotImplementedError(type(module)) + raise NotImplementedError(f"We haven't implement Prefix Tuning Layer for {module.__class__.__name__}. Please refer to https://opendelta.readthedocs.io/en/latest/notes/faq.html for detail.") return prefixlayer, module diff --git a/opendelta/delta_models/soft_prompt.py b/opendelta/delta_models/soft_prompt.py index f62b46d..ff2346d 100644 --- a/opendelta/delta_models/soft_prompt.py +++ b/opendelta/delta_models/soft_prompt.py @@ -145,24 +145,23 @@ class SoftPromptModel(DeltaBase): you set ``n_token`` tokens template before the will give the same result. Args: + backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. soft_token_num (:obj:`int`, *optional*): num of new tokens to add in the front of the input. init_range (:obj:`float`, *optional*): If initialize new tokens randomly, the random range of uniform distribution. - token_init (:obj:`bool`, *optional*, default to :obj:`True`): Whether to initialize the new tokens with tokens of the plm - other_expand_ids (:obj:`dict`, *optional*, default to `{"attention_mask":1, "token_type_ids":0}`) The name of - other tokens and its default value that expand along with the input sequence. For example, when - you prepend 100 tokens to the input_ids, the attention_mask should be extended, and the token_type_ids should - be extended as well. - modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only - the implemented ones) - unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen - together with the prefix parameters. + token_init (:obj:`bool`, *optional*, default to :obj:`True`): Whether to initialize the new tokens with tokens of the PLM. + other_expand_ids (:obj:`dict`, *optional*, default to ``{'attention_mask':1, 'token_type_ids':0}``): The name of other tokens and its default value that expand along with the input sequence. For example, when you prepend 100 tokens to the input_ids, the attention_mask should be extended, and the token_type_ids should be extended as well. + modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only the implemented ones). + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen together with the prefix parameters. common_structure (:obj:`bool`): whether using name-based addressing with a common structure mapping. 
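A usage sketch for soft prompt tuning with the arguments documented above (checkpoint and token count are illustrative):

```python
from transformers import AutoModelForSeq2SeqLM
from opendelta.delta_models.soft_prompt import SoftPromptModel

t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
delta = SoftPromptModel(backbone_model=t5, soft_token_num=100, token_init=True)
delta.freeze_module()   # only the soft prompt embeddings remain trainable
```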
+ """ + config_class = SoftPromptConfig delta_type = "soft_prompt" default_modified_modules = ["root"] # not used + _need_pseudo_data = False def __init__(self, backbone_model: nn.Module, soft_token_num=100, @@ -211,9 +210,7 @@ class SoftPromptModel(DeltaBase): def update_module(self): soft_prompt_layer = self.new_module_like(self.raw_embedding) - self.insert_sequential_module(self.backbone_model.get_encoder() if self.backbone_model.config.is_encoder_decoder else self.backbone_model, - delta_module=soft_prompt_layer, - delta_name="soft_prompt_layer" ) + self.insert_sequential_module(self.backbone_model.get_encoder() if self.backbone_model.config.is_encoder_decoder else self.backbone_model,delta_module=soft_prompt_layer,delta_name="soft_prompt_layer" ) def new_module_like(self, module): module_device = get_device(module) diff --git a/opendelta/utils/common_structures/__init__.py b/opendelta/utils/common_structures/__init__.py new file mode 100644 index 0000000..6c98f94 --- /dev/null +++ b/opendelta/utils/common_structures/__init__.py @@ -0,0 +1,24 @@ +CoreMappings = {} + +import importlib +import os +import sys + +cur_path = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, cur_path) + +filelist = os.listdir(cur_path) + +for file in filelist: + if not file.endswith(".py"): + continue + elif file.endswith("__init__.py"): + continue + else: + filename = file[:-3] + mappings = importlib.import_module(f".utils.common_structures.{filename}", "opendelta") + CoreMappings.update(mappings.Mappings) + + + + \ No newline at end of file diff --git a/opendelta/utils/common_structures/bert.py b/opendelta/utils/common_structures/bert.py new file mode 100644 index 0000000..b58a255 --- /dev/null +++ b/opendelta/utils/common_structures/bert.py @@ -0,0 +1,28 @@ +Mappings = {} + +Mappings['BertModel'] = { + "embeddings.word_embeddings": {"__name__":"embeddings"}, + "embeddings.position_embeddings": {"__name__":""}, + "embeddings.token_type_embeddings": {"__name__":""}, + "embeddings.LayerNorm": {"__name__":""}, + "encoder": {"__name__":"encoder", + "layer": {"__name__":"block", + "$": {"__name__":"$", + "attention": {"__name__":"attn", + "self.query": {"__name__":"q"}, + "self.key": {"__name__":"k"}, + "self.value": {"__name__":"v"}, + "output.dense": {"__name__":"proj"}, + "output.LayerNorm": {"__name__":"layer_norm"}, + }, + "output": {"__name__":"ff", + "dense": {"__name__":"w2"}, + "LayerNorm": {"__name__":"layer_norm"} + }, + "intermediate": {"__name__":"ff", + "dense": {"__name__":"w1"}, + } + } + } + }, +} diff --git a/opendelta/utils/common_structures/debertav2.py b/opendelta/utils/common_structures/debertav2.py new file mode 100644 index 0000000..727d03b --- /dev/null +++ b/opendelta/utils/common_structures/debertav2.py @@ -0,0 +1,31 @@ + +Mappings = {} + +Mappings['DebertaV2Model'] = { + "embeddings.word_embeddings": {"__name__":"embeddings"}, + "embeddings.LayerNorm": {"__name__":""}, + "encoder": {"__name__":"encoder", + "layer": {"__name__":"block", + "$": {"__name__":"$", + "attention": {"__name__":"attn", + "self.query_proj": {"__name__":"q"}, + "self.key_proj": {"__name__":"k"}, + "self.value_proj": {"__name__":"v"}, + "output.dense": {"__name__":"proj"}, + "output.LayerNorm": {"__name__":"layer_norm"}, + }, + "output": {"__name__":"ff", + "dense": {"__name__":"w2"}, + "LayerNorm": {"__name__":"layer_norm"} + }, + "intermediate.dense": {"__name__":"ff.w1"}, + } + }, + "rel_embeddings": {"__name__": ""}, + "LayerNorm": {"__name__": ""}, + "conv": {"__name__": "", + "conv": 
{"__name__": ""}, + "LayerNorm": {"__name__": ""} + } + }, +} \ No newline at end of file diff --git a/opendelta/utils/common_structures/gpt2.py b/opendelta/utils/common_structures/gpt2.py new file mode 100644 index 0000000..d486187 --- /dev/null +++ b/opendelta/utils/common_structures/gpt2.py @@ -0,0 +1,22 @@ + +Mappings = {} + +Mappings['GPT2Model'] = { + "wte": {"__name__":"embeddings"}, + "wpe": {"__name__":""}, + "h": {"__name__":"decoder.block", + "$": {"__name__":"$", + "attn": {"__name__":"attn", + "c_attn": {"__name__":"q,k,v"}, + "c_proj": {"__name__":"proj"}, + }, + "ln_1": {"__name__":"attn.layer_norm"}, + "mlp":{ "__name__": "ff", + "c_fc": {"__name__":"w1"}, + "c_proj": {"__name__":"w2"} + }, + "ln_2": {"__name__":"ff.layer_norm"}, + }, + }, + "ln_f": {"__name__":"decoder.layer_norm"}, +} \ No newline at end of file diff --git a/opendelta/utils/common_structures/opt.py b/opendelta/utils/common_structures/opt.py new file mode 100644 index 0000000..c1092e7 --- /dev/null +++ b/opendelta/utils/common_structures/opt.py @@ -0,0 +1,25 @@ + + +Mappings = {} +Mappings['OPTModel'] = { + "decoder.embed_tokens": {"__name__":"embeddings"}, + "decoder.embed_positions": {"__name__":""}, + "decoder.project_out": {"__name__":""}, + "decoder.project_in": {"__name__":""}, + "decoder": {"__name__":"decoder", + "layers": {"__name__":"block", + "$": {"__name__":"$", + "self_attn": {"__name__":"attn", + "q_proj": {"__name__":"q"}, + "k_proj": {"__name__":"k"}, + "v_proj": {"__name__":"v"}, + "out_proj": {"__name__":"proj"} + }, + "self_attn_layer_norm": {"__name__":"layer_norm"}, + "fc1": {"__name__":"ff.w1", "__virtual__": "ff", "__order__": "first"}, + "fc2": {"__name__":"ff.w2","__virtual__": "ff", "__order__": "last"}, + "final_layer_norm": {"__name__":"layer_norm"}, + } + } + } +} \ No newline at end of file diff --git a/opendelta/utils/common_structures/roberta.py b/opendelta/utils/common_structures/roberta.py new file mode 100644 index 0000000..94ce813 --- /dev/null +++ b/opendelta/utils/common_structures/roberta.py @@ -0,0 +1,27 @@ +Mappings = {} + +Mappings['RobertaModel'] = {"embeddings.word_embeddings": {"__name__":"embeddings"}, + "embeddings.position_embeddings": {"__name__":""}, + "embeddings.token_type_embeddings": {"__name__":""}, + "embeddings.LayerNorm": {"__name__":""}, + "encoder": {"__name__":"encoder", + "layer": {"__name__":"block", + "$": {"__name__":"$", + "attention": {"__name__":"attn", + "self.query": {"__name__":"q"}, + "self.key": {"__name__":"k"}, + "self.value": {"__name__":"v"}, + "output.dense": {"__name__":"proj"}, + "output.LayerNorm": {"__name__":"layer_norm"}, + }, + "output": {"__name__":"ff", + "dense": {"__name__":"w2"}, + "LayerNorm": {"__name__":"layer_norm"} + }, + "intermediate": {"__name__":"ff", + "dense": {"__name__":"w1"}, + } + } + } + }, +} \ No newline at end of file diff --git a/opendelta/utils/common_structures/t5.py b/opendelta/utils/common_structures/t5.py new file mode 100644 index 0000000..8150fe2 --- /dev/null +++ b/opendelta/utils/common_structures/t5.py @@ -0,0 +1,71 @@ +Mappings = {} + +t5encoder = {"__name__":"encoder", + "embed_tokens": {"__name__":"embeddings"}, + "block": {"__name__":"block", + "$": {"__name__":"$", + "layer.0": {"__name__":"attn", + "SelfAttention.q": {"__name__":"q"}, + "SelfAttention.k": {"__name__":"k"}, + "SelfAttention.v": {"__name__":"v"}, + "SelfAttention.o": {"__name__":"proj"}, + "SelfAttention.relative_attention_bias": {"__name__":""}, + "layer_norm": {"__name__":"layer_norm"}, + }, + "layer.1": 
{"__name__":"ff", + "DenseReluDense.wi": {"__name__":"w1"}, + "layer_norm": {"__name__":"layer_norm"}, + "DenseReluDense.wo": {"__name__":"w2"}, + } + } + }, + "final_layer_norm": {"__name__":"layer_norm"}, + } + +t5decoder = {"__name__":"decoder", + "embed_tokens": {"__name__":"embeddings"}, + "block": {"__name__":"block", + "$": {"__name__":"$", + "layer.0": {"__name__":"attn", + "SelfAttention.q": {"__name__":"q"}, + "SelfAttention.k": {"__name__":"k"}, + "SelfAttention.v": {"__name__":"v"}, + "SelfAttention.o": {"__name__":"proj"}, + "SelfAttention.relative_attention_bias": {"__name__":""}, + "layer_norm": {"__name__":"layer_norm"}, + }, + "layer.1": {"__name__":"crossattn", + "EncDecAttention.q": {"__name__":"q"}, + "EncDecAttention.k": {"__name__":"k"}, + "EncDecAttention.v": {"__name__":"v"}, + "EncDecAttention.o": {"__name__":"proj"}, + "layer_norm": {"__name__":"layer_norm"}, + }, + "layer.2": {"__name__":"ff", + "DenseReluDense.wi": {"__name__":"w1"}, + "layer_norm": {"__name__":"layer_norm"}, + "DenseReluDense.wo": {"__name__":"w2"}, + } + } + }, + "final_layer_norm": {"__name__":"layer_norm"}, + } + + + +Mappings['T5Model'] = { + "shared": {"__name__":"embeddings"}, + "encoder": t5encoder, + "decoder": t5decoder, +} + +Mappings['T5ForConditionalGeneration'] = { + "shared": {"__name__":"embeddings"}, + "encoder": t5encoder, + "decoder": t5decoder, +} + +Mappings['T5EncoderModel'] = { + "shared": {"__name__":"embeddings"}, + "encoder": t5encoder, +} \ No newline at end of file diff --git a/opendelta/utils/data_parallel.py b/opendelta/utils/data_parallel.py index ca0c4c0..8c32297 100644 --- a/opendelta/utils/data_parallel.py +++ b/opendelta/utils/data_parallel.py @@ -4,26 +4,50 @@ from opendelta.utils.decorate import decorate from collections import OrderedDict +def sequential_caller(_org_func, org_module, delta_name, *args, **kwargs): + args = args[1:] # the first argument here is ``self`` + delta_module = getattr(org_module, delta_name) + if hasattr(delta_module, "pre_forward"): + args, kwargs = delta_module.pre_forward(*args, **kwargs) + ret = _org_func(*args, **kwargs) + if hasattr(delta_module, "post_forward"): + ret = delta_module.post_forward(ret) + return ret + +def before_caller(_org_func, org_module, delta_name, *args, **kwargs): + args = args[1:] # the first argument here is ``self`` + delta_module = getattr(org_module, delta_name) + if hasattr(delta_module, "pre_forward"): + args, kwargs = delta_module.pre_forward(*args, **kwargs) + ret = _org_func(*args, **kwargs) + return ret + +def after_caller(_org_func, org_module, delta_name, *args, **kwargs): + args = args[1:] # the first argument here is ``self`` + delta_module = getattr(org_module, delta_name) + ret = _org_func(*args, **kwargs) + if hasattr(delta_module, "post_forward"): + ret = delta_module.post_forward(ret) + return ret + +def parallel_caller(_org_func, org_module, delta_name, *args, **kwargs): + args = args[1:] # the first argument here is ``self`` + delta_module = getattr(org_module, delta_name) + ret_1 = _org_func(*args, **kwargs) + ret_2 = delta_module.forward(*args, **kwargs) + return ret_1 + ret_2 + +caller_map = { + "sequential": sequential_caller, + "parallel": parallel_caller, + "before": before_caller, + "after": after_caller, +} + def new_replicate_for_data_parallel(self): r""" self is the parent module. """ # rewrite the replicate in DataParallel. 
- def _sequential_caller(_org_func, org_module, delta_name, *args, **kwargs): - args = args[1:] # the first argument here is ``self`` - delta_module = getattr(org_module, delta_name) - if hasattr(delta_module, "pre_forward"): - args, kwargs = delta_module.pre_forward(*args, **kwargs) - ret = _org_func(*args, **kwargs) - if hasattr(delta_module, "post_forward"): - ret = delta_module.post_forward(ret) - return ret - - def _parallel_caller(_org_func, org_module, delta_name, *args, **kwargs): - args = args[1:] # the first argument here is ``self`` - delta_module = getattr(org_module, delta_name) - ret_1 = _org_func(*args, **kwargs) - ret_2 = delta_module.forward(*args, **kwargs) - return ret_1 + ret_2 replica = self.__new__(type(self)) org_forward = replica.forward replica.__dict__ = self.__dict__.copy() @@ -33,10 +57,9 @@ def new_replicate_for_data_parallel(self): for _delta_info in self._delta_infos: if _delta_info['state'] == 'on': - if _delta_info['method'] == "insert_sequential": - new_forward = decorate(replica.forward, _sequential_caller, extras=(replica, _delta_info['delta_name']), kwsyntax=True) - elif _delta_info['method'] == "insert_parallel": - new_forward = decorate(replica.forward, _parallel_caller, extras=(replica, _delta_info['delta_name']), kwsyntax=True) + if _delta_info['method'] in caller_map.keys(): + caller = caller_map[_delta_info['method']] + new_forward = decorate(replica.forward, caller, extras=(replica, _delta_info['delta_name']), kwsyntax=True) else: raise NotImplementedError(f"data_parallel for _delta_info['method']=='{_delta_info['method']}' is not supported") replica.__dict__['forward'] = new_forward.__get__(replica, type(replica)) diff --git a/opendelta/utils/delta_center.py b/opendelta/utils/delta_center.py new file mode 100644 index 0000000..9bf185d --- /dev/null +++ b/opendelta/utils/delta_center.py @@ -0,0 +1,10 @@ +from DeltaCenter import OssClient +from .file_utils import default_cache_path + + +def download(finetuned_delta_path, cache_dir=None, force_download=False): + if cache_dir is None: + cache_dir = default_cache_path + path_to_unzip_file = OssClient.download(finetuned_delta_path, dest=cache_dir, force_download=force_download) + return path_to_unzip_file + diff --git a/opendelta/utils/delta_hub.py b/opendelta/utils/delta_hub.py index d0da33e..504fc54 100644 --- a/opendelta/utils/delta_hub.py +++ b/opendelta/utils/delta_hub.py @@ -4,6 +4,8 @@ def create_hub_repo_name(root = "DeltaHub", dataset = None, delta_type = None, model_name_or_path = None, + center_value_only_tags = None, + center_key_value_tags = None ): r"""Currently, it's only a simple concatenation of the arguments. 
""" @@ -14,6 +16,9 @@ def create_hub_repo_name(root = "DeltaHub", repo_name.append(f"{model_name_or_path}") repo_name.append(f"{dataset}") + repo_name.extend(list(center_value_only_tags) if center_value_only_tags else [None]) + repo_name.extend([f"{k}-{v}" for k,v in center_key_value_tags.items()] if center_key_value_tags else [None]) + repo_name = "_".join(repo_name) repo_name = root+"/"+repo_name diff --git a/opendelta/utils/file_utils.py b/opendelta/utils/file_utils.py new file mode 100644 index 0000000..2e82768 --- /dev/null +++ b/opendelta/utils/file_utils.py @@ -0,0 +1,3 @@ +import os +default_cache_path = "{}/.cache/delta_center/".format(os.path.expanduser('~')) +WEIGHTS_NAME = 'pytorch_model.bin' \ No newline at end of file diff --git a/opendelta/utils/saving_loading_utils.py b/opendelta/utils/saving_loading_utils.py index eaeac58..4b9b92e 100644 --- a/opendelta/utils/saving_loading_utils.py +++ b/opendelta/utils/saving_loading_utils.py @@ -1,396 +1,434 @@ -from io import RawIOBase -from tarfile import HeaderError -from typing import Union, Optional, Callable +from typing import Dict, List, Union, Optional, Callable from opendelta.delta_configs import BaseDeltaConfig -from opendelta.utils.model_md5 import gen_model_hash +from opendelta.utils.model_md5 import gen_model_hash, gen_parameter_hash import torch import os from opendelta import logging import torch.nn as nn -from transformers.file_utils import ( - WEIGHTS_NAME, - PushToHubMixin, - is_offline_mode, - is_remote_url, - hf_bucket_url, - cached_path, - ) -from transformers.utils.dummy_pt_objects import PreTrainedModel -import hashlib +from DeltaCenter import OssClient +import yaml +from dataclasses import dataclass, field, fields +import datetime +from .file_utils import WEIGHTS_NAME logger = logging.get_logger(__name__) -class SaveLoadMixin(PushToHubMixin): + + +alternative_names = { + "train_tasks": ["train_tasks", "train_task", "task_name"], +} + + +@dataclass +class DeltaCenterArguments: + """ + The arguments that are used to distinguish between different delta models on the DeltaCenter + """ + name: str = field(default="", + metadata={"help": "The name of the delta model checkpoint"} + ) + backbone_model: str = field(default="", + metadata={"help": "The backbone model of the delta model"} + ) + backbone_model_path_public: str = field( + default = None, + metadata={"help": "Publicly available path (url) to pretrained model or model identifier from huggingface.co/models"} + ) + delta_type: str = field( + default=None, + metadata={"help": "the type of type model, e.g., adapter, lora, etc."} + ) + train_tasks: Optional[Union[List[str], str]]= field( + default=None, + metadata={"help": "the task(s) that the delta is trained on"} + ) + train_datasets: Optional[Union[List[str], str]]= field( + default=None, + metadata={"help": "the datasets(s) that the delta is trained on"} + ) + checkpoint_size: Optional[float] = field( + default=None, + metadata={"help": "the size of the checkpoint, in MB"} + ) + test_tasks: Optional[Union[List[str], str]] = field( + default=None, + metadata={"help": "the task(s) that the delta is tested on"} + ) + test_datasets: Optional[Union[List[str], str]] = field( + default=None, + metadata={"help": "the dataset(s) that the delta is tested on"} + ) + test_performance: Optional[float] = field( + default=None, + metadata={"help": "the performance of the model on the test set"} + ) + test_metrics: Optional[str] = field( + default=None, + metadata={"help": "the metrics used by the model"} + ) + 
trainable_ratio: Optional[float] = field( + default=None, + metadata={"help": "the ratio of trainable parameters in the model"} + ) + delta_ratio: Optional[float] = field( + default=None, + metadata={"help": "the ratio of delta parameters in the model"} + ) + usage: Optional[str] = field( + default="", + metadata={"help": "the usage code of the model"} + ) + license: Optional[str] = field( + default="apache-2.0", + metadata={"help": "the license of the model"} + ) + + + +class SaveLoadMixin: def add_configs_when_saving(self,): self.config.backbone_class = self.backbone_model.__class__.__name__ - self.config.backbone_checkpoint_name = os.path.split(self.backbone_model.config._name_or_path.strip("/"))[-1] + if hasattr(self.backbone_model, "config"): + self.config.backbone_checkpoint_name = os.path.split(self.backbone_model.config._name_or_path.strip("/"))[-1] self.config.backbone_hash = gen_model_hash(self.backbone_model) - def save_finetuned( self, - save_directory: Optional[Union[str, os.PathLike]] = "./output/", + finetuned_delta_path: Optional[Union[str, os.PathLike]] = "./delta_checkpoints/", save_config: bool = True, state_dict: Optional[dict] = None, save_function: Callable = torch.save, - push_to_hub: bool = False, - **kwargs, + push_to_dc: bool = False, + center_args: Optional[Union[DeltaCenterArguments, dict]] = dict(), + center_args_pool: Optional[dict] = dict(), + list_tags: Optional[List] = list(), + dict_tags: Optional[Dict] = dict(), + delay_push: bool = False, + test_result = None, + usage: Optional[str] = "", ): r""" Save a model and its configuration file to a directory, so that it can be re-loaded using the - :py:meth:`~DeltaBase.from_finetuned` class method. + :py:meth:`~DeltaBase.save_finetuned` class method. Arguments: - save_directory (:obj:`str` or :obj:`os.PathLike`): - Directory to which to save. Will be created if it doesn't exist. - save_config (:obj:`bool`, *optional*, defaults to :obj:`True`): - Whether or not to save the config of the model. Useful when in distributed training like TPUs and need - to call this function on all processes. In this case, set ``save_config=True`` only on the main process - to avoid race conditions. - state_dict (nested dictionary of :obj:`torch.Tensor`): - The state dictionary of the model to save. Will default to ``self.state_dict()``, but can be used to only - save parts of the model or if special precautions need to be taken when recovering the state dictionary - of a model (like when using model parallelism). - save_function (:obj:`Callable`): - The function to use to save the state dictionary. Useful on distributed training like TPUs when one - need to replace ``torch.save`` by another method. - push_to_hub (:obj:`bool`, *optional*, defaults to :obj:`False`): - Whether or not to push your model to the HuggingFace model hub after saving it. - - .. tip:: - - Using ``push_to_hub=True`` will synchronize the repository you are pushing to with ``save_directory``, - which requires ``save_directory`` to be a local clone of the repo you are pushing to if it's an existing - folder. Pass along ``temp_dir=True`` to use a temporary directory instead. - - kwargs: - Additional key word arguments passed along to the :py:meth:`~file_utils.PushToHubMixin.push_to_hub` method. - - .. note:: - - You may need to install git-lfs on your machine. - - .. 
code-block:: bash - - wget -P ~ https://github.com/git-lfs/git-lfs/releases/download/v3.0.2/git-lfs-linux-amd64-v3.0.2.tar.gz - cd ~ - tar -xvzf git-lfs-linux-amd64-v3.0.2.tar.gz - export PATH=~:$PATH - git-lfs install + finetuned_delta_path: (optional) path to the directory where the model and its configuration file will be saved. + If not specified, the model will be saved in the directory ``./delta_checkpoints/``, + which is a subdirectory of the current working directory. + save_config: (optional) if ``True``, the configuration file will be saved in the same directory as the + model file. If ``False``, only the state dict will be saved. + state_dict: (optional) a dictionary containing the model's state_dict. If not specified, the + state_dict is loaded from the backbone model's trainable parameters. + save_function: (optional) the function used to save the model. Defaults to ``torch.save``. + push_to_dc: (optional) if ``True``, the model will prepare things to be pushed to the DeltaCenter. + This includes: + - creating a configuration file for the model + - creating a directory for the model + - saving the model's trainable parameters + - pushing the model to the DeltaCenter + center_args: (optional) the arguments that are used to distinguish between different delta models on the DeltaCenter + center_args_pool: (optional) a dictionary containing the arguments that are used to distinguish between different delta models on the DeltaCenter + list_tags: (optional) a list of tags that will be added to the model's configuration file + dict_tags: (optional) a dictionary of tags that will be added to the model's configuration file + delay_push: (optional) if ``True``, the model will not be pushed to the DeltaCenter. This is useful if you want to + push the model later. """ + + # create the config to save, including model hash, etc. 
+ if save_config: + if not hasattr(self, "config"): + self.create_config_from_model() + self.add_configs_when_saving() + + if push_to_dc: + final_center_args = self.create_delta_center_args(center_args=center_args, + center_args_pool=center_args_pool) + + save_directory = finetuned_delta_path if os.path.isfile(save_directory): logger.error(f"Provided path ({save_directory}) should be a directory, not a file") return - if push_to_hub: - commit_message = kwargs.pop("commit_message", None) - repo = self._create_or_get_repo(save_directory, **kwargs) - os.makedirs(save_directory, exist_ok=True) - # Only save the model itself if we are using distributed training - + if push_to_dc: + save_directory = os.path.join(save_directory, final_center_args.name) + os.makedirs(save_directory, exist_ok=True) + model_to_save = self.backbone_model# unwrap_model(self) # Save the model if state_dict is None: state_dict = model_to_save.state_dict() - - # Save the config - if save_config: - if not hasattr(self, "config"): - self.create_config_from_model() - self.add_configs_when_saving() - self.config.save_finetuned(save_directory) - # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(save_directory, WEIGHTS_NAME) save_function(state_dict, output_model_file) - logger.info(f"Model weights saved in {output_model_file}") + # Save the config + if save_config: + self.config.save_finetuned(save_directory) + - if push_to_hub: - url = self._push_to_hub(repo, commit_message=commit_message) - logger.info(f"Model pushed to the hub in this commit: {url}") + + + + + logger.info("\n"+"*"*30+f"\nYour delta model has been saved locally to:\t{os.path.abspath(save_directory)}" + ) + self.compute_saving(output_model_file) + + state_dict_total_params = sum(p.numel() for p in state_dict.values()) + other_tags={} + other_tags.update({'state_dict_total_params(M)':state_dict_total_params/1024/1024}) + other_tags.update({'test_result':test_result}) + if push_to_dc: + logger.info("Creating yaml file for delta center") + self.create_yml(save_directory, final_center_args, list_tags, dict_tags, other_tags) + + if not delay_push: + OssClient.upload(base_dir=save_directory) + else: + logger.info(f"Delay push: you can push it to the delta center later using \n\tpython -m DeltaCenter upload {os.path.abspath(save_directory)}\n" + +"*"*30) + else: + logger.info("We encourage users to push their final and public models to delta center to share them with the community!") + + def compute_saving(self, output_model_file): + import os + stats = os.stat(output_model_file) + if stats.st_size > (1024**3): + unit = 'GB' + value = stats.st_size/(1024**3) + else: + unit = 'MB' + value = stats.st_size/(1024**2) + logger.info("The state dict size is {:.3f} {}".format(value, unit)) + + + + + def create_yml(self, save_dir, config, list_tags=list(), dict_tags=dict(),other_tags=None): + f = open("{}/config.yml".format(save_dir), 'w') + config_dict = vars(config) + config_dict['dict_tags'] = dict_tags + config_dict['list_tags'] = list_tags + if other_tags is not None: + config_dict.update(other_tags) + yaml.safe_dump(config_dict, f) + f.close() + + def load_checkpoint(self, path, load_func=torch.load, backbone_model=None): + r"""Simple method for loading only the checkpoint + """ + if backbone_model is None: + backbone_model = self.backbone_model + self.backbone_model.load_state_dict(load_func(f"{path}/{WEIGHTS_NAME}"), strict=False) + + def save_checkpoint(self, path, save_func=torch.save, 
backbone_model=None): + r"""Simple method for saving only the checkpoint""" + if backbone_model is None: + backbone_model = self.backbone_model + save_func(backbone_model.state_dict(), f"{path}/{WEIGHTS_NAME}") @classmethod - def from_finetuned(cls, - finetuned_model_name_or_path: Optional[Union[str, os.PathLike]], - backbone_model: nn.Module, - *model_args, - check_hash: Optional[bool] = True, - **kwargs): + def from_finetuned(cls, + finetuned_delta_path: Optional[Union[str, os.PathLike]], + backbone_model: nn.Module, + delta_config = None, + cache_dir: Optional[Union[str, os.PathLike]] = None, + state_dict: Optional[dict] = None, + *model_args, + force_download: Optional[bool] = False, + check_hash: Optional[bool] = True, + local_files_only: Optional[bool] = False, + **kwargs): r""" Instantiate a finetuned delta model from a path. - The backbone_model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated). + The backbone_model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated). To further train the model, you can use the :meth:`freeze_module ` method. Parameters: - - finetuned_model_name_or_path (:obj:`str` or :obj:`os.PathLike`, *optional*): - Can be either: - - - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under a - user or organization name, like ``dbmdz/bert-base-german-cased``. - - A path to a *directory* containing model weights saved using - :meth:`SaveLoadMixin.save_finetuned`, e.g., ``./my_model_directory/``. - - A path or url to a *tensorflow index checkpoint file* (e.g, ``./tf_model/model.ckpt.index``). In - this case, ``from_tf`` should be set to ``True`` and a configuration object should be provided as - ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a - PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. - - A path or url to a model folder containing a *flax checkpoint file* in *.msgpack* format (e.g, - ``./flax_model/`` containing ``flax_model.msgpack``). In this case, ``from_flax`` should be set to - ``True``. - - ``None`` if you are both providing the configuration and state dictionary (resp. with keyword - arguments ``config`` and ``state_dict``). - backbone_model (:obj:`torch.nn.Module`): The backbone model to be modified. - model_args (sequence of positional arguments, *optional*): - All remaining positional arguments will be passed to the underlying model's ``__init__`` method. - config (Union[:obj:`BaseDeltaConfig`, :obj:`str`, :obj:`os.PathLike`], *optional*): Can be either: - - an instance of a class derived from :class:`~PretrainedConfig`, - - a string or path valid as input to :py:meth:`~PretrainedConfig.from_pretrained`. - - Configuration for the model to use instead of an automatically loaded configuration. Configuration can - be automatically loaded when: - - - The model is a model provided by the library (loaded with the *model id* string of a pretrained - model). - - The model was saved using :py:meth:`~PreTrainedModel.save_pretrained` and is reloaded by supplying the - save directory. - - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a - configuration JSON file named *config.json* is found in the directory. 
- state_dict (Dict[:obj:`str`, :obj:`torch.Tensor`], *optional*): - A state dictionary to use instead of a state dictionary loaded from saved weights file. - This option can be used if you want to create a model from a pretrained configuration but load your own - weights. In this case though, you should check if using :py:meth:`~PreTrainedModel.save_pretrained` and - :py:meth:`~PreTrainedModel.from_pretrained` is not a simpler option. - cache_dir (:obj:`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. - force_download (:obj:`bool`, *optional*, defaults to :obj:`False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - resume_download (:obj:`bool`, *optional*, defaults to :obj:`False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. - proxies (:obj:`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only(:obj:`bool`, *optional*, defaults to :obj:`False`): - Whether or not to only look at local files (i.e., do not try to download the model). - use_auth_token (:obj:`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token generated - when running ``transformers-cli login`` (stored in ``~/.huggingface``). - revision(:obj:`str`, *optional*, defaults to ``"main"``): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any - identifier allowed by git. - mirror(:obj:`str`, *optional*): - Mirror source to accelerate downloads in China. If you are from China and have an accessibility - problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. - Please refer to the mirror site for more information. - torch_dtype (:obj:`str` or :obj:`torch.dtype`, *optional*): - Override the default :obj:`torch.dtype` and load the model under this dtype. If ``"auto"`` is passed the dtype - will be automatically derived from the model's weights. - - .. warning:: - - This feature is inherited from HuggingFace. We do not guarantee its usefulness currently. - One should only disable *_fast_init* to ensure backwards compatibility with `transformers.__version__ < - 4.6.0` for seeded model initialization. This argument will be removed at the next major version. See - `pull request 11471 `_ for more information. - kwargs (remaining dictionary of keyword arguments, *optional*): - Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., - ``output_attentions=True``). Behaves differently depending on whether a ``config`` is provided or - automatically loaded: - - - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the - underlying model's ``__init__`` method (we assume all relevant updates to the configuration have - already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class - initialization function (:py:meth:`~PretrainedConfig.from_pretrained`). 
Each key of ``kwargs`` that - corresponds to a configuration attribute will be used to override said attribute with the - supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute - will be passed to the underlying model's ``__init__`` function. - - .. tip:: - Passing ``use_auth_token=True`` is required when you want to use a private model. - - .. code-block:: python - - from transformers import AutoModelForSeq2SeqLM - t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - from opendelta import AutoDeltaModel - delta = AutoDeltaModel.from_finetuned("DeltaHub/lora_t5-base_mrpc", backbone_model=t5) - delta.log() - - - + finetuned_delta_path: (optional) path to the directory from which the model and its configuration file will be loaded. + If not specified, the model will be loaded from the cache directory (see ``cache_dir``). + backbone_model: the backbone model that will be used to instantiate the finetuned delta model. + delta_config: (optional) the configuration file of the finetuned delta model. If not specified, the configuration file + is loaded from the directory ``finetuned_delta_path``. + cache_dir: (optional) path to the directory in which a downloaded delta checkpoint should be cached. + If not specified, we will first look into the current working directory, then the cache directory of your system, e.g., ~/.cache/delta_center/. + state_dict: (optional) a dictionary containing the model's state_dict. If not specified, the + state_dict is loaded from the ``finetuned_delta_path``. + force_download: (optional) if ``True``, the model will be downloaded from the internet even if it is already + present in the cache directory. + check_hash: (optional) if ``True``, check whether the hash of the backbone model recorded when the delta was trained differs from the hash of the backbone model you are loading now. + local_files_only: (optional) if ``True``, only look at local files (i.e., do not try to download the checkpoint). 
""" - config = kwargs.pop("config", None) - state_dict = kwargs.pop("state_dict", None) - cache_dir = kwargs.pop("cache_dir", None) - - # ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - # output_loading_info = kwargs.pop("output_loading_info", False) - local_files_only = kwargs.pop("local_files_only", False) - use_auth_token = kwargs.pop("use_auth_token", None) - revision = kwargs.pop("revision", None) - mirror = kwargs.pop("mirror", None) - from_pipeline = kwargs.pop("_from_pipeline", None) - from_auto_class = kwargs.pop("_from_auto", False) - # _fast_init = kwargs.pop("_fast_init", True) - torch_dtype = kwargs.pop("torch_dtype", None) - # low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False) - user_agent = {"file_type": "model", "framework": "pytorch", "from_auto_class": from_auto_class} - - if is_offline_mode() and not local_files_only: - logger.info("Offline mode: forcing local_files_only=True") + if os.environ.get("DELTACENTER_OFFLINE", '0') == '1': + logger.info("Delta Center offline mode!") local_files_only = True # Load config if we don't provide a configuration - if not isinstance(config, BaseDeltaConfig): - config_path = config if config is not None else finetuned_model_name_or_path - config, model_kwargs = cls.config_class.from_finetuned( - config_path, - cache_dir=cache_dir, + + + finetuned_delta_path = str(finetuned_delta_path) + + if cache_dir is not None: + cached_finetuned_delta_path = os.path.join(cache_dir, finetuned_delta_path) + else: + cached_finetuned_delta_path = finetuned_delta_path + + download_from_dc = False + if os.path.isfile(cached_finetuned_delta_path): + raise RuntimeError( + f"You should pass a directory to load a delta checkpoint instead of a file, " + f"since we need the delta's configuration file." + ) + elif os.path.isdir(cached_finetuned_delta_path): + if os.path.isfile(os.path.join(cached_finetuned_delta_path, WEIGHTS_NAME)): + # Load from a PyTorch checkpoint + weight_file = os.path.join(cached_finetuned_delta_path, WEIGHTS_NAME) + else: + raise EnvironmentError( + f"Error no file named {WEIGHTS_NAME} found in " + f"directory {cached_finetuned_delta_path}." 
+ ) + + else: + # try to download from DeltaCenter + from .delta_center import download as dcdownload + cached_finetuned_delta_path = dcdownload(finetuned_delta_path, cache_dir=cache_dir, force_download=force_download) + download_from_dc = True + weight_file = os.path.join(cached_finetuned_delta_path, WEIGHTS_NAME) + + if state_dict is None: + state_dict = torch.load(weight_file, map_location="cpu") + + if not isinstance(delta_config, BaseDeltaConfig): + delta_config, model_kwargs = cls.config_class.from_finetuned( + cached_finetuned_delta_path, + cache_dir=None, return_unused_kwargs=True, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - _from_auto=from_auto_class, - _from_pipeline=from_pipeline, + local_files_only=True if download_from_dc else local_files_only, # has been downloaded **kwargs, ) else: model_kwargs = kwargs - # Load model - if finetuned_model_name_or_path is not None: - finetuned_model_name_or_path = str(finetuned_model_name_or_path) - if os.path.isdir(finetuned_model_name_or_path): - if os.path.isfile(os.path.join(finetuned_model_name_or_path, WEIGHTS_NAME)): - # Load from a PyTorch checkpoint - archive_file = os.path.join(finetuned_model_name_or_path, WEIGHTS_NAME) - else: - raise EnvironmentError( - f"Error no file named {WEIGHTS_NAME} found in " - f"directory {finetuned_model_name_or_path}." - ) - elif os.path.isfile(finetuned_model_name_or_path) or is_remote_url(finetuned_model_name_or_path): - archive_file = finetuned_model_name_or_path - else: - archive_file = hf_bucket_url( - finetuned_model_name_or_path, - filename=WEIGHTS_NAME, - revision=revision, - mirror=mirror, - ) - try: - # Load from URL or cache if already cached #TODO + # Initialize the model from config and attach the delta model to the backbone_model. + delta_model = cls.from_config(delta_config, backbone_model, *model_args, **model_kwargs, ) - resolved_archive_file = cached_path( - archive_file, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - user_agent=user_agent, - ) - except EnvironmentError as err: - logger.error(err) - msg = ( - f"Can't load weights for '{finetuned_model_name_or_path}'. Make sure that:\n\n" - ) - - if revision is not None: - msg += f"- or '{revision}' is a valid git identifier (branch name, a tag name, or a commit id) that exists for this model name as listed on its model page on 'https://huggingface.co/models'\n\n" - - raise EnvironmentError(msg) - - if resolved_archive_file == archive_file: - logger.info(f"loading weights file {archive_file}") - else: - logger.info(f"loading weights file {archive_file} from cache at {resolved_archive_file}") - else: - resolved_archive_file = None - - # load pt weights early so that we know which dtype to init the model under - - if state_dict is None: - try: - state_dict = torch.load(resolved_archive_file, map_location="cpu") - except Exception as e: - try: - with open(resolved_archive_file) as f: - if f.read().startswith("version"): - raise OSError( - "You seem to have cloned a repository without having git-lfs installed. Please install " - "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder " - "you cloned." 
- ) - else: - raise ValueError from e - except (UnicodeDecodeError, ValueError): - raise OSError( - f"Unable to load weights from pytorch checkpoint file for '{finetuned_model_name_or_path}' " - f"at '{resolved_archive_file}'. " - "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True." - ) - - # set dtype to instantiate the model under: - # 1. If torch_dtype is not None, we use that dtype - # 2. If torch_dtype is "auto", we auto-detect dtype from the loaded state_dict, by checking its first - # weights entry - we assume all weights are of the same dtype - # we also may have config.torch_dtype available, but we won't rely on it till v5 - dtype_orig = None - if torch_dtype is not None: - if isinstance(torch_dtype, str): - if torch_dtype == "auto": - torch_dtype = next(iter(state_dict.values())).dtype - else: - raise ValueError( - f"`torch_dtype` can be either a `torch.dtype` or `auto`, but received {torch_dtype}" - ) - dtype_orig = cls._set_default_torch_dtype(torch_dtype) - - - # Initialize the model from config and attach the delta model to the backbone_model. - delta_model = cls.from_config(config, backbone_model, *model_args, **model_kwargs, ) - - # load the state_dict into the backbone_model. As the delta model's parameter + # load the state_dict into the backbone_model. As the delta model's parameter # is the same object as the deltas in the backbone model with different reference name, # the state_dict will also be loaded into the delta model. delta_model._load_state_dict_into_backbone(backbone_model, state_dict) backbone_hash = gen_model_hash(backbone_model) - if check_hash and hasattr(config, "backbone_hash") and \ - config.backbone_hash is not None and \ - config.backbone_hash != backbone_hash: - logger.warning("The config has an hash of the backbone model, and is" - "different from the hash of the loaded model. This indicates a mismatch" - "between the backbone model that the delta checkpoint is based on and" - "the one you loaded. You propobability need to Train the model instead of" - "directly inference. ") + + if check_hash: + if hasattr(delta_config, "backbone_hash") and \ + delta_config.backbone_hash is not None and \ + delta_config.backbone_hash != backbone_hash: + logger.warning("The config has a hash of the backbone model, and it is " + "different from the hash of the loaded model. This indicates a mismatch " + "between the backbone model that the delta checkpoint is based on and " + "the one you loaded. You probably need to train the model instead of " + "running direct inference. ") + else: + logger.info("Hash-check passed. You can safely use this checkpoint directly.") + else: + logger.warning("Parameters' hash has not been checked!") + # Set model in evaluation mode to deactivate DropOut modules by default backbone_model.eval() return delta_model - + + + def create_delta_center_args(self, center_args, center_args_pool): + """ + Create the delta center args for the center model. + center_args has higher priority than center_args_pool. 
+ + """ + mdict = {} + field = fields(DeltaCenterArguments) + + + for f in field: + exist = False + # first is center_args, exact match + if f.name in center_args: + mdict[f.name] = center_args[f.name] + continue + # second is center_args_pool, can use alternative names + if f.name in center_args_pool: + mdict[f.name] = center_args_pool[f.name] + exist = True + elif f.name in alternative_names: + for altername in alternative_names[f.name]: + if altername in center_args_pool: + mdict[f.name] = center_args_pool[altername] + exist = True + break + # if not found, take it from self.stat or fall back to the default + if not exist: + if f.name in self.stat: + mdict[f.name] = self.stat[f.name] + else: + mdict[f.name] = f.default + + # if the name is still not set, create a default one + if mdict['name'] is None or mdict['name'] == '': + logger.info("Name is not set, use default name.") + mdict['name'] = self.create_default_name(**mdict) + + if len(mdict['usage']) == 0: + logger.info("Usage is not set, use default usage.") + mdict['usage'] = self.create_default_usage(mdict['name']) + + + center_args = DeltaCenterArguments(**mdict) + return center_args + + def create_default_usage(self, name): + usage_str = """from opendelta import AutoDeltaModel\n""" + \ + """delta_model = AutoDeltaModel.from_finetuned('{name_with_userid}', backbone_model=model)\n""" + \ + """delta_model.freeze_module() # if you are going to further train it \n""" + \ + """delta_model.log()""" + return usage_str + + def create_default_name(self, **kwargs): + r"""Currently, it's only a simple concatenation of the arguments. + """ + + reponame = "" + reponame += kwargs["backbone_model_path_public"].split("/")[-1]+"_" if kwargs['backbone_model_path_public'] is not None else kwargs['backbone_model']+"_" + reponame += kwargs["delta_type"]+"_" if kwargs["delta_type"] is not None else "" + + # tasks + if isinstance(kwargs["train_tasks"], list): + train_tasks = "+".join(kwargs["train_tasks"]) + elif kwargs["train_tasks"] is not None: + train_tasks = kwargs["train_tasks"] + else: + logger.warning("train_tasks are not found in the arguments. Did you miss them?") + train_tasks = None + reponame += train_tasks+"_" if train_tasks is not None else "" + + # time + reponame += datetime.datetime.now().strftime("%Y%m%d%H%M%S") #+ gen_model_hash(model=self.backbone_model) + + # model hash + if hasattr(self.config, "backbone_hash"): + reponame += self.config.backbone_hash[:3] + return reponame + diff --git a/opendelta/utils/signature.py b/opendelta/utils/signature.py index b559f92..41aa95e 100644 --- a/opendelta/utils/signature.py +++ b/opendelta/utils/signature.py @@ -4,10 +4,10 @@ from collections import namedtuple def signature(f): r"""Get the function f 's input arguments. A useful gadget when some function slot might be instantiated into multiple functions. - + Args: f (:obj:`function`) : the function to get the input arguments. - + Returns: namedtuple : of args, default, varargs, keywords, respectively.s @@ -34,7 +34,7 @@ def signature(f): ] or None argspec = namedtuple('Signature', ['args', 'defaults', 'varargs', 'keywords']) - return argspec(args, defaults, varargs, keywords) + return argspec(args, defaults, varargs, keywords) def get_arg_names(f): r""" Get a functions argument name, remove the ``self`` argument @@ -45,6 +45,7 @@ def get_arg_names(f): return args + def get_arg_names_inside_func(func): r""" Get the functions argument name inside the function itself. Remove ``self`` argument. 
""" diff --git a/opendelta/utils/structure_mapping.py b/opendelta/utils/structure_mapping.py index 4cdc507..ea4b7ae 100644 --- a/opendelta/utils/structure_mapping.py +++ b/opendelta/utils/structure_mapping.py @@ -3,209 +3,20 @@ import copy import opendelta.utils.logging as logging from opendelta.utils.visualization import Visualization logger = logging.get_logger(__name__) -t5_mapping = { - "shared": {"__name__":"embeddings"}, - "encoder": {"__name__":"encoder", - "embed_tokens": {"__name__":"embeddings"}, - "block": {"__name__":"block", - "$": {"__name__":"$", - "layer.0": {"__name__":"attn", - "SelfAttention.q": {"__name__":"q"}, - "SelfAttention.k": {"__name__":"k"}, - "SelfAttention.v": {"__name__":"v"}, - "SelfAttention.o": {"__name__":"proj"}, - "SelfAttention.relative_attention_bias": {"__name__":""}, - "layer_norm": {"__name__":"layer_norm"}, - }, - "layer.1": {"__name__":"ff", - "DenseReluDense.wi": {"__name__":"w1"}, - "layer_norm": {"__name__":"layer_norm"}, - "DenseReluDense.wo": {"__name__":"w2"}, - } - } - }, - "final_layer_norm": {"__name__":"layer_norm"}, - }, - "decoder": {"__name__":"decoder", - "embed_tokens": {"__name__":"embeddings"}, - "block": {"__name__":"block", - "$": {"__name__":"$", - "layer.0": {"__name__":"attn", - "SelfAttention.q": {"__name__":"q"}, - "SelfAttention.k": {"__name__":"k"}, - "SelfAttention.v": {"__name__":"v"}, - "SelfAttention.o": {"__name__":"proj"}, - "SelfAttention.relative_attention_bias": {"__name__":""}, - "layer_norm": {"__name__":"layer_norm"}, - }, - "layer.1": {"__name__":"crossattn", - "EncDecAttention.q": {"__name__":"q"}, - "EncDecAttention.k": {"__name__":"k"}, - "EncDecAttention.v": {"__name__":"v"}, - "EncDecAttention.o": {"__name__":"proj"}, - "layer_norm": {"__name__":"layer_norm"}, - }, - "layer.2": {"__name__":"ff", - "DenseReluDense.wi": {"__name__":"w1"}, - "layer_norm": {"__name__":"layer_norm"}, - "DenseReluDense.wo": {"__name__":"w2"}, - } - } - }, - "final_layer_norm": {"__name__":"layer_norm"}, - } -} -roberta_mapping = { - "roberta.embeddings.word_embeddings": {"__name__":"embeddings"}, - "roberta.embeddings.position_embeddings": {"__name__":""}, - "roberta.embeddings.token_type_embeddings": {"__name__":""}, - "roberta.embeddings.LayerNorm": {"__name__":""}, - "roberta.encoder": {"__name__":"encoder", - "layer": {"__name__":"block", - "$": {"__name__":"$", - "attention": {"__name__":"attn", - "self.query": {"__name__":"q"}, - "self.key": {"__name__":"k"}, - "self.value": {"__name__":"v"}, - "output.dense": {"__name__":"proj"}, - "output.LayerNorm": {"__name__":"layer_norm"}, - }, - "output": {"__name__":"ff", - "dense": {"__name__":"w2"}, - "LayerNorm": {"__name__":"layer_norm"} - }, - "intermediate.dense": {"__name__":"ff.w1"}, - } - } - }, - "lm_head": {"__name__":"lm_head", - "dense": {"__name__":""}, - "layer_norm": {"__name__":""}, - "decoder": {"__name__":"proj"}, - }, -} +from opendelta.utils.common_structures import CoreMappings +MAPPINGERROR_MSG = f"Available Models with default configurations are {list(CoreMappings.keys())} . Please manually add the delta models by speicifying 'modified_modules' based on the visualization of your model structure. Refer to `https://opendelta.readthedocs.io/en/latest/notes/faq.html` for detail." 
-bert_mapping = { - "bert.embeddings.word_embeddings": {"__name__":"embeddings"}, - "bert.embeddings.position_embeddings": {"__name__":""}, - "bert.embeddings.token_type_embeddings": {"__name__":""}, - "bert.embeddings.LayerNorm": {"__name__":""}, - "bert.encoder": {"__name__":"encoder", - "layer": {"__name__":"block", - "$": {"__name__":"$", - "attention": {"__name__":"attn", - "self.query": {"__name__":"q"}, - "self.key": {"__name__":"k"}, - "self.value": {"__name__":"v"}, - "output.dense": {"__name__":"proj"}, - "output.LayerNorm": {"__name__":"layer_norm"}, - }, - "output": {"__name__":"ff", - "dense": {"__name__":"w2"}, - "LayerNorm": {"__name__":"layer_norm"} - }, - "intermediate.dense": {"__name__":"ff.w1"}, - } - } - }, - "cls.predictions": {"__name__": "lm_head", - "transform.dense": {"__name__":""}, - "transform.LayerNorm": {"__name__":""}, - "decoder": {"__name__":"proj"}, - } -} - -debertav2_mapping = { - "deberta.embeddings.word_embeddings": {"__name__":"embeddings"}, - "deberta.embeddings.LayerNorm": {"__name__":""}, - "deberta.encoder": {"__name__":"encoder", - "layer": {"__name__":"block", - "$": {"__name__":"$", - "attention": {"__name__":"attn", - "self.query_proj": {"__name__":"q"}, - "self.key_proj": {"__name__":"k"}, - "self.value_proj": {"__name__":"v"}, - "output.dense": {"__name__":"proj"}, - "output.LayerNorm": {"__name__":"layer_norm"}, - }, - "output": {"__name__":"ff", - "dense": {"__name__":"w2"}, - "LayerNorm": {"__name__":"layer_norm"} - }, - "intermediate.dense": {"__name__":"ff.w1"}, - } - }, - "rel_embeddings": {"__name__": ""}, - "LayerNorm": {"__name__": ""}, - "conv": {"__name__": "", - "conv": {"__name__": ""}, - "LayerNorm": {"__name__": ""} - } - }, - "lm_predictions.lm_head": {"__name__":"lm_head", - "dense": {"__name__":""}, - "LayerNorm": {"__name__":""}, - "bias": {"__name__": ""} - }, -} - -gpt2_mapping = { - "transformer.wte": {"__name__":"embeddings"}, - "transformer.wpe": {"__name__":""}, - "transformer.h": {"__name__":"decoder.block", - "$": {"__name__":"$", - "attn": {"__name__":"attn", - "c_attn": {"__name__":"q,k,v"}, - "c_proj": {"__name__":"proj"}, - }, - "ln_1": {"__name__":"attn.layer_norm"}, - "mlp":{ "__name__": "ff", - "c_fc": {"__name__":"w1"}, - "c_proj": {"__name__":"w2"} - }, - "ln_2": {"__name__":"ff.layer_norm"}, - }, - }, - "transformer.ln_f": {"__name__":"decoder.layernorm"}, - "lm_head": {"__name__":"lm_head.proj"}, -} - -distilbert_mapping = { - "distilbert.embeddings.word_embeddings": {"__name__":"embeddings"}, - "distilbert.embeddings.position_embeddings": {"__name__":""}, - "distilbert.embeddings.token_type_embeddings": {"__name__":""}, - "distilbert.embeddings.LayerNorm": {"__name__":""}, - "distilbert.transformer": {"__name__":"encoder", - "layer": {"__name__":"block", - "$": {"__name__":"$", - "attention": {"__name__":"attn", - "q_lin": {"__name__":"q"}, - "k_lin": {"__name__":"k"}, - "v_lin": {"__name__":"v"}, - "out_lin": {"__name__":"proj"}, - }, - "ffn": {"__name__":"ff", - "lin1": {"__name__":"w1"}, - "lin2": {"__name__":"w2"}, - }, - "sa_layer_norm": {"__name__":"attn.layer_norm"}, - "output_layer_norm":{"__name__": "ff.layer_norm"} - } - } - } -} - def transform(org_key, mapping, strict=True, warning=False, verbose=False): - chain = org_key.split(".") query = "" node = mapping new_chain = [] + virtual_key, virtual_chain, in_virtual_order = None, None, None for elem in chain: query += elem if query in node: @@ -215,135 +26,93 @@ def transform(org_key, mapping, strict=True, warning=False, verbose=False): if 
strict: if warning: print(f"'{org_key}' has no common mapping.") - return + return else: new_chain.append(query) else: - new_chain.append(new_elem) + splited_new_elem = new_elem.split(".") + splited_new_elem = [e+"@" for e in splited_new_elem] + special_token = '.'.join(splited_new_elem) + if '__virtual__' in node: + virtual_chain = copy.deepcopy(new_chain) + virtual_chain.append(".".join([e+'@' for e in node["__virtual__"].split(".")])) + in_virtual_order = node['__order__'] + new_chain.append(special_token) # special token for transformed key + + query = "" elif "$" in node: node = node["$"] new_chain.append(query) query = "" else: - query += "." + query += "." if query!="": if strict: if warning: print("A part of the orginial key hasn't been matched!") - return + return else: new_chain.append(query.strip(".")) # tailing query + new_key = ".".join(new_chain) if verbose: print(f"{org_key} => {new_key}") - return new_key - + if virtual_chain is not None: + virtual_key = ".".join(virtual_chain) + return new_key, virtual_key, in_virtual_order -def mapping_for_SequenceClassification(mapping, type): - mapping = copy.deepcopy(mapping) - if type == "roberta": - mapping.pop("lm_head") - mapping['classifier'] = {"__name__":"classifier", - "dense": {"__name__": "dense"}, - "out_proj": {"__name__":"out_proj"} - } - elif type == "bert": - mapping.pop("cls.predictions") - mapping["classifier"] = {"__name__": "classifier"} - elif type == "deberta": - mapping.pop("lm_predictions.lm_head") - mapping["pooler"] = {"__name__": "classifier"} - mapping["classifier"] = {"__name__": "classifier"} - else: - raise NotImplementedError - return mapping - -def mapping_for_ConditionalGeneration(mapping, type): - mapping = copy.deepcopy(mapping) - if type == "t5": - mapping["lm_head"] = {"__name__":"lm_head.proj"} - else: - raise NotImplementedError - return mapping - -class _LazyLoading(OrderedDict): - def __init__(self, mapping): - self._mapping_string = mapping - self._mapping = {} - - def __getitem__(self, key): - if key not in self._mapping_string: - raise KeyError(key) - value = self._mapping_string[key] - self._mapping[key] = eval(value) - return self._mapping[key] - - def keys(self): - return list(self._mapping_string.keys()) - - def __contains__(self, item): - - return item in self._mapping_string - class CommonStructureMap(object): - r""" A lazy loading structure map. + r""" A loading structure map. """ - Mappings = _LazyLoading({ - "RobertaForSequenceClassification": """mapping_for_SequenceClassification(roberta_mapping, "roberta")""", - "RobertaForMaskedLM": "roberta_mapping", - "BertForMaskedLM": "bert_mapping", - "BertForSequenceClassification": """mapping_for_SequenceClassification(bert_mapping, "bert")""", - "T5ForConditionalGeneration": """mapping_for_ConditionalGeneration(t5_mapping, "t5")""", - "DebertaV2ForSequenceClassification": """mapping_for_SequenceClassification(debertav2_mapping, "deberta")""" - }) + + New_Mappings = CoreMappings SpecialModelInverseMaps = { } - def __init__(self, mapping): - if not isinstance(mapping, dict): - raise TypeError(f"Initial a {CommonStructureMap.__name__} using a non-dict object. 
Consider using `load` instead.") - self.mapping = mapping + def __init__(self, backbone_model, strict=True, warning=False, visualize=True): + self.matched_pairs = {} + self.find_sub_common_structure(backbone_model, matched_pairs=self.matched_pairs) + if len(self.matched_pairs) == 0: + raise KeyError(MAPPINGERROR_MSG) - @classmethod - def load(cls, backbone_model, strict=True, warining=False, visualize=True): - r"""Doc - """ - backbone_class = type(backbone_model).__name__ - if backbone_class not in cls.Mappings: - raise KeyError(backbone_class) - mapping = cls.Mappings[backbone_class] - if visualize: - logger.info("Since you are using the common structure mapping, draw the transformed parameter structure for checking.") - vis = Visualization(backbone_model) - vis.structure_graph(common_structure=True, mapping=mapping) - return cls(mapping) - def __repr__(self,): return self.mapping - def transform(self, org_key, strict=True, warning=False): - return transform(org_key, self.mapping, strict, warning) + r'''Transform a key in the original model to the name convention in common structure. + ''' + new_key = org_key + virtual_key, in_virtual_order = None, None + + for key in self.matched_pairs: + left, right = org_key[:len(key)], org_key[len(key):].strip(".") + if left == key and len(right) > 0: + transformed_key, virtual_key, in_virtual_order = transform(right, self.matched_pairs[key], strict, warning) + if len(left) > 0: + new_key = left + "." + transformed_key + else: + new_key = transformed_key + break + return new_key, virtual_key, in_virtual_order + + def find_sub_common_structure(self, module, prefix='',matched_pairs = []): + if module.__class__.__name__ in self.New_Mappings: + if self.New_Mappings[module.__class__.__name__]: + if callable(self.New_Mappings[module.__class__.__name__]): + mapping = self.New_Mappings[module.__class__.__name__](module) + else: + mapping = self.New_Mappings[module.__class__.__name__] + matched_pairs[prefix] = mapping + return + for name, m in module.named_children(): + new_prefix = '.'.join([prefix, name]) if prefix != '' else name + self.find_sub_common_structure(m, prefix=new_prefix, matched_pairs = matched_pairs) + -if __name__ == "__main__": - from openprompt.plms import load_plm - import argparse - parser = argparse.ArgumentParser("") - parser.add_argument("--model", type=str, default='t5-lm', help="We test both t5 and t5-lm in this scripts, the corresponding tokenizerwrapper will be automatically loaded.") - parser.add_argument("--model_name_or_path", default="t5-base-lm-adapt") - parser.add_argument("--cache_base", default='/home/hushengding/plm_cache/') - parser.add_argument("--keep_non_params", action="store_true") - parser.add_argument("--expand_params", action="store_true") - args = parser.parse_args() - plm, tokenizer, model_config, WrapperClass = load_plm(args.model, args.cache_base+args.model_name_or_path) - - for name, _ in plm.named_modules(): - transform(name, t5_mapping, strict=True, warning=False) - diff --git a/opendelta/utils/visualization.py b/opendelta/utils/visualization.py index 2feeec9..7149137 100644 --- a/opendelta/utils/visualization.py +++ b/opendelta/utils/visualization.py @@ -86,7 +86,6 @@ class Visualization(object): """ def __init__(self, plm: nn.Module): - self.plm = plm self.type_color = "green" self.param_color = "cyan" diff --git a/requirements.txt b/requirements.txt index a53c347..12251a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,8 +3,10 @@ transformers>=4.10.0 datasets==1.17.0 sentencepiece>=0.1.96 
tqdm>=4.62.2 -# loralib decorator rich web.py gitpython +scipy # need? +sklearn # need? +delta_center_client==0.0.4 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..0642610 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,5 @@ +[easy_install] + +index_url = https://pypi.org/simple + +# index_url = https://pypi.tuna.tsinghua.edu.cn/simple \ No newline at end of file diff --git a/setup.py b/setup.py index 31e0d58..27c0313 100644 --- a/setup.py +++ b/setup.py @@ -3,24 +3,35 @@ import setuptools import os import os -def get_requirements(path): - print("path is :", path) - ret = [] - with open(os.path.join(path, "requirements.txt"), encoding="utf-8") as freq: - for line in freq.readlines(): - ret.append( line.strip() ) +requires = """torch>=1.8.0 +transformers>=4.10.0 +datasets==1.17.0 +sentencepiece>=0.1.96 +tqdm>=4.62.2 +decorator +rich +web.py +gitpython +scipy # need? +sklearn # need? +delta_center_client==0.0.4 +""" + +def get_requirements(): + ret = [x for x in requires.split("\n") if len(x)>0] + print("requirements:", ret) return ret -path = os.path.dirname(os.path.abspath(__file__)) -requires = get_requirements(path) -print(requires) + +# path = os.path.dirname(os.path.abspath(__file__)) +# requires = get_requirements(path) with open('README.md', 'r') as f: setuptools.setup( name = 'opendelta', - version = "0.1.0", + version = "0.3.1", description = "An open source framework for delta learning (parameter efficient learning).", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", @@ -30,10 +41,10 @@ with open('README.md', 'r') as f: url="https://github.com/thunlp/OpenDelta", keywords = ['PLM', 'Parameter-efficient-Learning', 'AI', 'NLP'], python_requires=">=3.6.0", - install_requires=requires, + install_requires=get_requirements(), package_dir={'opendelta':'opendelta'}, package_data= { - 'opendelta':["utils/interactive/templates/*.html", 'requirements.txt'], + 'opendelta':["utils/interactive/templates/*.html", 'requirements.txt'], }, include_package_data=True, packages=setuptools.find_packages(),
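
Taken together, the rewritten `SaveLoadMixin`, the new default-configuration lookup in `structure_mapping.py`, and the `delta_center_client` dependency above support a workflow along the following lines. This is a hedged sketch rather than code from this diff: the backbone choice, the `modified_modules` selection, the checkpoint name, and the paths are illustrative assumptions (the T5 submodule names come from the mapping removed above).

```python
# Sketch only: assumes OpenDelta >= 0.3.x with the DeltaCenter client installed.
from transformers import AutoModelForSeq2SeqLM
from opendelta import AutoDeltaConfig, AutoDeltaModel

backbone = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# If a backbone has no default configuration (see MAPPINGERROR_MSG above), pick the
# target modules by hand; "SelfAttention.q" / "SelfAttention.v" are real T5 submodules.
delta_config = AutoDeltaConfig.from_dict({
    "delta_type": "lora",
    "modified_modules": ["SelfAttention.q", "SelfAttention.v"],
})
delta = AutoDeltaModel.from_config(delta_config, backbone_model=backbone)
delta.freeze_module()

# ... fine-tune the delta parameters here ...

# Save only the delta weights. With push_to_dc=True the mixin also writes a config.yml
# for DeltaCenter; delay_push=True skips the upload so it can be done later via
# `python -m DeltaCenter upload <dir>`, as the log message in save_finetuned states.
delta.save_finetuned(
    "./delta_checkpoints/",
    push_to_dc=True,
    delay_push=True,
    center_args={"name": "t5-small_lora_demo", "backbone_model": "t5-small", "delta_type": "lora"},
)

# Reload: from_finetuned expects a directory containing pytorch_model.bin and the delta
# config, and (with check_hash=True) compares the recorded backbone hash to the loaded one.
backbone = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
delta_reloaded = AutoDeltaModel.from_finetuned(
    "./delta_checkpoints/t5-small_lora_demo", backbone_model=backbone
)
```

Splitting packaging (`push_to_dc=True, delay_push=True`) from the actual upload mirrors the two branches in `save_finetuned` above: the checkpoint directory and `config.yml` are prepared locally either way, and the OSS upload is deferred to a separate CLI invocation when `delay_push` is set.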