diff --git a/OpenDelta-0.3.2/.gitignore b/OpenDelta-0.3.2/.gitignore new file mode 100644 index 0000000..411800c --- /dev/null +++ b/OpenDelta-0.3.2/.gitignore @@ -0,0 +1,71 @@ +data/ +**/__pycache__/ +logs/* +experiments/logs +!logs/.gitkeep +datasets/* +!datasets/*.sh +.vscode/ +*.egg-info/ +eggs/ +.eggs/ +*.egg +**.egg +build/ +_build/ +**/build/ +outputs/ +log.txt +**/DeltaHub/ +**/sfs_scripts/ +*beans/ +**/examples/*/configs/* +!examples/*/configs/config_gen.py +**/jupyter_notebook_examples/ +!examples/jupyter_notebook_examples/*.py +!examples/*/configs/*.py +**/outputs_search/**/*.bin +**/outputs_search/**/*.pt + + +*.db +**/nohup.out +**/examples/examples_bmtrain/BigModels/down_data +**/examples/examples_bmtrain/BMTrain_stable +**/examples/examples_bmtrain/BMPretrain +**/examples/examples_bmtrain/BigModels/BigModels/results +**/Delta_Memory/ +**/output/ +**/thunlp/ +**/saved_ckpts/ + + +DeltaCenter-Python-Client/ +backbone_structure +delta_checkpoints +gitop.sh +load_dataset_and_model.ipynb +load_model.py +scripts +t.py +t.sh +!examples/examples_prompt/configs/*/*.json +!examples/examples_prompt/configs/** +**/delta_checkpoints/ +**/outputs/ + +dist/ +dist/* + +**/unittest/** +!unittest/**.py +!unittest/**.sh +!unittest/**.md + +**/tutorial/** +!tutorial/**.py +!tutorial/**.sh +!tutorial/**.md + + + diff --git a/OpenDelta-0.3.2/.readthedocs.yaml b/OpenDelta-0.3.2/.readthedocs.yaml new file mode 100644 index 0000000..7f43aea --- /dev/null +++ b/OpenDelta-0.3.2/.readthedocs.yaml @@ -0,0 +1,29 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 1 + +# Set the version of Python and other tools you might need +build: + os: ubuntu-20.04 + tools: + python: "3.9" + # You can also specify other tool versions: + # nodejs: "16" + # rust: "1.55" + # golang: "1.17" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + +# If using Sphinx, optionally build your docs in additional formats such as PDF +# formats: +# - pdf + +# Optionally declare the Python requirements required to build your docs +python: + install: + - requirements: docs/requirements.txt \ No newline at end of file diff --git a/OpenDelta-0.3.2/LICENSE b/OpenDelta-0.3.2/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/OpenDelta-0.3.2/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/OpenDelta-0.3.2/README.md b/OpenDelta-0.3.2/README.md new file mode 100644 index 0000000..ac4e020 --- /dev/null +++ b/OpenDelta-0.3.2/README.md @@ -0,0 +1,161 @@ +
+ + + + +**An Open-Source Framework for Parameter-Efficient Tuning (Delta Tuning).** + +------ +

+ Overview • + Installation • + Basic Usage • + Docs • + Performance • + + +

+ +
+ +![version](https://img.shields.io/badge/version-0.3.2-blue) + + +## Overview + +OpenDelta is a toolkit for parameter-efficient tuning methods (which we dub *delta tuning*), with which users can flexibly assign (or add) a small number of parameters to update while keeping most parameters frozen. With OpenDelta, users can easily implement prefix-tuning, adapters, LoRA, or any other type of delta tuning with their preferred PTMs. + +- The latest version of OpenDelta is tested on Python==3.8.13, PyTorch==1.12.1, transformers==4.22.2. Other versions are likely to be supported as well. If you encounter bugs when using your own package versions, please raise an issue; we will look into it as soon as possible. + +- **A demo of using OpenDelta to modify the PLM (e.g., BART).** +![How PLM changes using Delta-tuning](docs/source/imgs/demo.gif) + +## News +- **2022.10.25** Release v0.3.2. Support [BMTrain]()! Improve docs. Add inspect utilities. +- **2022.10.14** Release v0.3.0. We make the usage of the default configuration of each delta tuning method (i.e., the position it is attached to) more friendly! If a custom model has our supported models as submodules inside, the default configuration is also available. Other key changes can be seen in the [Update Log](https://opendelta.readthedocs.io/en/latest/notes/update.html#version-0-3-0) +- **2022.10.10** Merge a long-developed branch v0.2.4 into the master branch. Key updates are (1) an example unifying the delta tuning paradigm and the prompt-tuning paradigm; (2) support for [Delta Center](https://www.openbmb.org/toolKits/deltacenter), whose webpage is still under construction. Details can be seen in the [Update Log](https://opendelta.readthedocs.io/en/latest/notes/update.html#version-0-2-4) +- **2022.03.24** We notice several bugs in Soft Prompt Tuning and Prefix Tuning, mainly due to their need to customize attention ids and token_type_ids; we are fixing them! Currently, please use the other methods, since they are more stable and perform better. +- **2022.03.20** Add a [colab example](https://colab.research.google.com/drive/1uAhgAdc8Qr42UKYDlgUv0f7W1-gAFwGo?usp=sharing) to illustrate efficient training and space-saving multitask serving. +- **2022.03.20** A new pip version released. +- **2022.02.16** Support [regular expressions](https://opendelta.readthedocs.io/en/latest/notes/namebasedaddr.html#regexexpr) in name-based addressing. + +## Installation +1. Create a virtualenv (optional) +```shell +conda create -n opendelta_env python=3.8 +conda activate opendelta_env +``` + +2. Install the latest version +```bash +pip install git+https://github.com/thunlp/OpenDelta.git +``` + +**or** install the latest pip version (more stable) +```bash +pip install opendelta +``` +**or** build from source +```bash +git clone git@github.com:thunlp/OpenDelta.git +cd OpenDelta +python setup.py install +# python setup.py develop # if you want to do some modifications on the code for your research +``` + +## Must Try +The following code and comments walk you through the key functionality of OpenDelta. It is also available in [must_try.py](https://github.com/thunlp/OpenDelta/tree/main/examples/unittest/must_try.py) and [must_try.ipynb in colab](https://colab.research.google.com/drive/1Nbe9zxt8LGQnKmtvEs07IN_PznjNCyk4?usp=sharing). + +```python +# use transformers as usual.
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer +t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large") +t5_tokenizer = AutoTokenizer.from_pretrained("t5-large") +# A running example +inputs_ids = t5_tokenizer.encode("Is Harry Poter wrtten by JKrowling", return_tensors="pt") +t5_tokenizer.decode(t5.generate(inputs_ids)[0]) +# >>> '? Is it Harry Potter?' + + +# use existing delta models +from opendelta import AutoDeltaModel, AutoDeltaConfig +# use existing delta models from DeltaCenter +delta = AutoDeltaModel.from_finetuned("thunlp/Spelling_Correction_T5_LRAdapter_demo", backbone_model=t5) +# freeze the whole backbone model except the delta models. +delta.freeze_module() +# visualize the change +delta.log() + + +t5_tokenizer.decode(t5.generate(inputs_ids)[0]) +# >>> Is Harry Potter written by JK Rowling? + + +# Now save merely the delta models, not the whole backbone model, to tmp/ +delta.save_finetuned(".tmp") +import os; os.listdir(".tmp") +# >>> The state dict size is 1.443 MB +# >>> We encourage users to push their final and public models to delta center to share them with the community! + + +# reload the model from the local path and add it to the pre-trained T5. +t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large") +delta1 = AutoDeltaModel.from_finetuned(".tmp", backbone_model=t5) +import shutil; shutil.rmtree(".tmp") # don't forget to remove the tmp files. +t5_tokenizer.decode(t5.generate(inputs_ids)[0]) +# >>> Is Harry Potter written by JK Rowling? + +# detach the delta models; the model returns to the unmodified status. +delta1.detach() +t5_tokenizer.decode(t5.generate(inputs_ids)[0]) +# >>> '? Is it Harry Potter?' + +# use the default configuration for customized wrapped models which have PLMs inside. This is a common need for users. +import torch.nn as nn +class WrappedModel(nn.Module): + def __init__(self, inner_model): + super().__init__() + self.inner = inner_model + def forward(self, *args, **kwargs): + return self.inner(*args, **kwargs) + +wrapped_model = WrappedModel(WrappedModel(t5)) + +# say we use LoRA +delta_config = AutoDeltaConfig.from_dict({"delta_type":"lora"}) +delta2 = AutoDeltaModel.from_config(delta_config, backbone_model=wrapped_model) +delta2.log() +# >>> root +# -- inner +# -- inner +# ... +# ... lora_A:[8,1024], lora_B:[1024,8] +delta2.detach() + +# use a non-default configuration +# say we add LoRA to the last four layers of the decoder of t5, with lora rank=5 +delta_config3 = AutoDeltaConfig.from_dict({"delta_type":"lora", "modified_modules":["[r]decoder.*((20)|(21)|(22)|(23)).*DenseReluDense\.wi"], "lora_r":5}) +delta3 = AutoDeltaModel.from_config(delta_config3, backbone_model=wrapped_model) +delta3.log() + +``` + +## Verified Default Configurations + +- **You can try to use OpenDelta on *any* backbone model based on PyTorch.** +- However, there is a small chance that the interface of the submodules of the backbone model is not supported. Therefore, we have verified some commonly used models that OpenDelta is sure to support. + +- We will keep testing more and more emerging models. + +- Pull requests are welcome when you successfully apply OpenDelta to your own backbone model. + + diff --git a/OpenDelta-0.3.2/docs/Makefile b/OpenDelta-0.3.2/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/OpenDelta-0.3.2/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two.
+SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/OpenDelta-0.3.2/docs/make.bat b/OpenDelta-0.3.2/docs/make.bat new file mode 100644 index 0000000..061f32f --- /dev/null +++ b/OpenDelta-0.3.2/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/OpenDelta-0.3.2/docs/readme.md b/OpenDelta-0.3.2/docs/readme.md new file mode 100644 index 0000000..a9e75df --- /dev/null +++ b/OpenDelta-0.3.2/docs/readme.md @@ -0,0 +1,20 @@ +# OpenDelta Documentation + +To build this doc locally, please firstly install [sphinx](https://www.sphinx-doc.org/en/master/) packages. + +``` +pip install sphinx +pip install sphinx_rtd_theme +pip install sphinx_copybutton +pip install sphinx_toolbox +pip install myst_parser +``` + +Then install opendelta either from source, or from pip. After that, + +``` +cd docs +make html +``` + +Then open the generated `docs/build/html/index.html` in your local browser. \ No newline at end of file diff --git a/OpenDelta-0.3.2/docs/requirements.txt b/OpenDelta-0.3.2/docs/requirements.txt new file mode 100644 index 0000000..2e18d0b --- /dev/null +++ b/OpenDelta-0.3.2/docs/requirements.txt @@ -0,0 +1,17 @@ +sphinx_copybutton +sphinx_rtd_theme +sphinx_toolbox +myst_parser + +torch>=1.8.0 +transformers>=4.10.0 +datasets==1.17.0 +sentencepiece>=0.1.96 +tqdm>=4.62.2 +decorator +rich +web.py +gitpython +scipy # need? +sklearn # need? +delta_center_client==0.0.4 \ No newline at end of file diff --git a/OpenDelta-0.3.2/docs/source/_static/css/custom.css b/OpenDelta-0.3.2/docs/source/_static/css/custom.css new file mode 100644 index 0000000..9cfdbbe --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/_static/css/custom.css @@ -0,0 +1,268 @@ +/* a, */ +.wy-menu-vertical header, +.wy-menu-vertical p.caption, +.wy-nav-top .fa-bars, +.wy-menu-vertical a:hover, + +/* Colors and text decoration. + For example, :black:`text in black` or :blink:`text blinking` in rST. 
*/ + + /* .black { + color: black; +} + +.gray { + color: gray; +} + +.grey { + color: gray; +} + +.silver { + color: silver; +} + +.white { + color: white; +} + +.maroon { + color: maroon; +} + +.red { + color: red; +} + +.magenta { + color: magenta; +} + +.fuchsia { + color: fuchsia; +} + +.pink { + color: pink; +} + +.orange { + color: rgba(218, 135, 12, 0.897); +} */ + +/* .string { + color: rgb(172, 51, 44); +} */ + +/* .yellow { + color: yellow; +} + +.lime { + color: lime; +} + +.green { + color: green; +} + +.olive { + color: olive; +} + +.teal { + color: teal; +} + +.cyan { + color: cyan; +} + +.aqua { + color: aqua; +} + +.blue { + color: blue; +} + +.navy { + color: navy; +} + +.purple { + color: purple; +} + +.under { + text-decoration: underline; +} + +.over { + text-decoration: overline; +} + +.blink { + text-decoration: blink; +} + +.line { + text-decoration: line-through; +} + +.strike { + text-decoration: line-through; +} + +.it { + font-style: italic; +} + +.ob { + font-style: oblique; +} + +.small { + font-size: small; +} + +.large { + font-size: large; +} + +.smallpar { + font-size: small; +} */ + +a:link { + color: rgb(141, 99, 224) +} + +a:visited { + color: rgb(141, 99, 224) +} + +a:hover { + color: rgb(147, 47, 218) +} +.rst-content code.literal +{ + color: rgb(172, 49, 42) !important; + /* #5360f0 */ +} + +.rst-content tt.literal +{ + color: #f06b53 !important; +} +/* #a153f0 */ +/* inspired by sphinx press theme */ +.wy-menu.wy-menu-vertical li.toctree-l1.current > a { + border-left: solid 15px rgb(150, 92, 232) !important; + text-indent: -15px; + border-top: none; + border-bottom: none; +} + +.wy-menu.wy-menu-vertical li.toctree-l1.current > ul { + border-left: solid 15px #ddcaf7 !important; +} +/* inspired by sphinx press theme */ + +.wy-nav-side { + color: unset !important; + background: unset !important; + border-right: solid 1px #ccc !important; +} + +.wy-side-nav-search, +.wy-nav-top, +.wy-menu-vertical li, +.wy-menu-vertical li a:hover, +.wy-menu-vertical li a +{ + background: unset !important; +} + +.wy-menu-vertical li.current a { + border-right: unset !important; +} + +.wy-side-nav-search div, +.wy-menu-vertical a { + color: #404040 !important; +} + +.wy-menu-vertical button.toctree-expand { + color: #333 !important; +} + +.wy-nav-content { + max-width: unset; +} + +.rst-content { + max-width: 900px; +} + +.wy-nav-content .icon-home:before { + content: "Docs"; +} + +.wy-side-nav-search .icon-home:before { + content: ""; +} + +dl.field-list { + display: block !important; +} + +dl.field-list > dt:after { + content: "" !important; +} + +dl.field-list > dt { + display: table; + padding-left: 6px !important; + padding-right: 6px !important; + margin-bottom: 4px !important; + padding-bottom: 1px !important; + background: rgb(252, 237, 208); + border-left: solid 2px rgb(231, 181, 134); +} + + +dl.py.class>dt +{ + color: rgba(17, 16, 17, 0.822) !important; + background: rgb(247, 234, 252) !important; + border-top: solid 2px #b620d0 !important; +} + +dl.py.method>dt +{ + background: rgb(250, 239, 241) !important; + border-left: solid 2px rgb(199, 83, 106) !important; +} + +dl.py.attribute>dt, +dl.py.property>dt +{ + background: rgba(194, 233, 248, 0.1) !important; + border-left: solid 2px #58b5cc !important; +} + +.fa-plus-square-o::before, .wy-menu-vertical li button.toctree-expand::before, +.fa-minus-square-o::before, .wy-menu-vertical li.current > a button.toctree-expand::before, .wy-menu-vertical li.on a button.toctree-expand::before +{ + content: ""; +} + 
+.rst-content .viewcode-back, +.rst-content .viewcode-link +{ + font-size: 120%; +} + + diff --git a/OpenDelta-0.3.2/docs/source/_static/js/custom.js b/OpenDelta-0.3.2/docs/source/_static/js/custom.js new file mode 100644 index 0000000..489b7d5 --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/_static/js/custom.js @@ -0,0 +1,7 @@ +document.addEventListener("DOMContentLoaded", function(event) { + document.querySelectorAll(".wy-menu.wy-menu-vertical > ul.current > li > a").forEach(a => a.addEventListener("click", e=>{ + f = document.querySelector(".wy-menu.wy-menu-vertical > ul.current > li > ul") + if (f.style.display=='none') { f.style.display='block'; } else f.style.display = 'none' + })); + document.querySelectorAll(".headerlink").forEach(a => a.text="\u{1F517}"); +}); \ No newline at end of file diff --git a/OpenDelta-0.3.2/docs/source/conf.py b/OpenDelta-0.3.2/docs/source/conf.py new file mode 100644 index 0000000..8a94518 --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/conf.py @@ -0,0 +1,147 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) +import sys +sys.path.insert(0, "../../") +import datetime +import sphinx_rtd_theme +import doctest +import opendelta + + + + +# -- Project information ----------------------------------------------------- + +project = 'OpenDelta' +author = 'THUNLP OpenDelta Team' +copyright = '{}, {}, Licenced under the Apache License, Version 2.0'.format(datetime.datetime.now().year, author) + + +# The full version, including alpha/beta/rc tags +release = '0.3.2' +version = "0.3.2" + +html_theme = 'sphinx_rtd_theme' +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +doctest_default_flags = doctest.NORMALIZE_WHITESPACE +autodoc_member_order = 'bysource' +intersphinx_mapping = {'python': ('https://docs.python.org/', None), +"torch": ("https://pytorch.org/docs/stable/", None),} + +html_show_sourcelink = True + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.doctest', + 'sphinx.ext.intersphinx', + # 'sphinx.ext.mathbase', + 'sphinx.ext.mathjax', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'sphinx.ext.githubpages', + 'sphinx_copybutton', + 'sphinx_toolbox.collapse', + 'myst_parser', +] + +myst_enable_extensions = [ + "html_image", + "colon_fence", + "html_admonition", + "amsmath", + "dollarmath", +] + +source_suffix = { + '.rst': 'restructuredtext', + '.txt': 'markdown', + '.md': 'markdown', +} + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. 
+# exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +# html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_theme_options = { + # 'collapse_navigation': False, + # 'display_version': True, + #'logo_only': False, + 'navigation_depth': 2, +} + + +html_static_path = ['_static'] +html_css_files = ['css/custom.css'] +html_js_files = ['js/custom.js'] +rst_context = {'opendelta': opendelta} +# rst_epilog = "\n.. include:: .special.rst\n" +add_module_names = False + +def include_only_tagged(app, what, name, obj, skip, options): + inclusion_tag_format = "[NODOC]" #can be any pattern here, choose what works for you + for tag in app.tags.tags: + if obj.__doc__ is not None and not obj.__doc__.startswith(inclusion_tag_format): + return False + return True + +def skip2(app, what, name, obj, skip, options): + members = [ + '__init__', + '__repr__', + '__weakref__', + '__dict__', + '__module__', + ] + return True if name in members else skip + +def skip(app, what, name, obj, skip, options): + skip = include_only_tagged(app, what, name, obj, skip, options) or\ + skip2(app, what, name, obj, skip, options) + return skip + +def setup(app): + + + + def rst_jinja_render(app, docname, source): + src = source[0] + rendered = app.builder.templates.render_string(src, rst_context) + source[0] = rendered + + app.connect('autodoc-skip-member', skip) + app.connect("source-read", rst_jinja_render) \ No newline at end of file diff --git a/OpenDelta-0.3.2/docs/source/imgs/afterfreeze.png b/OpenDelta-0.3.2/docs/source/imgs/afterfreeze.png new file mode 100644 index 0000000..5d37408 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/afterfreeze.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/bart-base.png b/OpenDelta-0.3.2/docs/source/imgs/bart-base.png new file mode 100644 index 0000000..52b023d Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/bart-base.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/bert_vis.png b/OpenDelta-0.3.2/docs/source/imgs/bert_vis.png new file mode 100644 index 0000000..a64c175 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/bert_vis.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/bertdelta_noparam.png b/OpenDelta-0.3.2/docs/source/imgs/bertdelta_noparam.png new file mode 100644 index 0000000..3f1cdf7 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/bertdelta_noparam.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/bertdelta_vis.png b/OpenDelta-0.3.2/docs/source/imgs/bertdelta_vis.png new file mode 100644 index 0000000..e21cf20 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/bertdelta_vis.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/commonstructure_vis.png b/OpenDelta-0.3.2/docs/source/imgs/commonstructure_vis.png new file mode 100644 index 0000000..e5db4e1 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/commonstructure_vis.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/composition_of_delta.png b/OpenDelta-0.3.2/docs/source/imgs/composition_of_delta.png new file mode 100644 index 0000000..b33a060 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/composition_of_delta.png differ 
diff --git a/OpenDelta-0.3.2/docs/source/imgs/defaultmodification.png b/OpenDelta-0.3.2/docs/source/imgs/defaultmodification.png new file mode 100644 index 0000000..a729ccb Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/defaultmodification.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/demo.gif b/OpenDelta-0.3.2/docs/source/imgs/demo.gif new file mode 100644 index 0000000..2652756 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/demo.gif differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/hint-icon-2.jpg b/OpenDelta-0.3.2/docs/source/imgs/hint-icon-2.jpg new file mode 100644 index 0000000..0d9a0c6 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/hint-icon-2.jpg differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/hint-icon.png b/OpenDelta-0.3.2/docs/source/imgs/hint-icon.png new file mode 100644 index 0000000..83ebb44 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/hint-icon.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/interact.jpg b/OpenDelta-0.3.2/docs/source/imgs/interact.jpg new file mode 100644 index 0000000..0cbcee7 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/interact.jpg differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/multiple_to_one_layer.png b/OpenDelta-0.3.2/docs/source/imgs/multiple_to_one_layer.png new file mode 100644 index 0000000..1df3e24 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/multiple_to_one_layer.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/name_based_addressing.png b/OpenDelta-0.3.2/docs/source/imgs/name_based_addressing.png new file mode 100644 index 0000000..c341a3d Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/name_based_addressing.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/plugunplug1.png b/OpenDelta-0.3.2/docs/source/imgs/plugunplug1.png new file mode 100644 index 0000000..7dc17f1 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/plugunplug1.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/plugunplug2.png b/OpenDelta-0.3.2/docs/source/imgs/plugunplug2.png new file mode 100644 index 0000000..1330350 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/plugunplug2.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/plugunplug3.png b/OpenDelta-0.3.2/docs/source/imgs/plugunplug3.png new file mode 100644 index 0000000..3f6aa5d Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/plugunplug3.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/plugunplug4.png b/OpenDelta-0.3.2/docs/source/imgs/plugunplug4.png new file mode 100644 index 0000000..a0a6e24 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/plugunplug4.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/plugunplug5.png b/OpenDelta-0.3.2/docs/source/imgs/plugunplug5.png new file mode 100644 index 0000000..c7b2dcc Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/plugunplug5.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/plugunplug6.png b/OpenDelta-0.3.2/docs/source/imgs/plugunplug6.png new file mode 100644 index 0000000..7adf668 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/plugunplug6.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/pointing-right-finger.png b/OpenDelta-0.3.2/docs/source/imgs/pointing-right-finger.png new file mode 100644 index 0000000..6216065 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/pointing-right-finger.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/raw_print.png 
b/OpenDelta-0.3.2/docs/source/imgs/raw_print.png new file mode 100644 index 0000000..836b2de Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/raw_print.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/t5lora.png b/OpenDelta-0.3.2/docs/source/imgs/t5lora.png new file mode 100644 index 0000000..1d78cdb Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/t5lora.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/todo-icon.jpeg b/OpenDelta-0.3.2/docs/source/imgs/todo-icon.jpeg new file mode 100644 index 0000000..9846fc2 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/todo-icon.jpeg differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/toy-delta.png b/OpenDelta-0.3.2/docs/source/imgs/toy-delta.png new file mode 100644 index 0000000..ab32640 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/toy-delta.png differ diff --git a/OpenDelta-0.3.2/docs/source/imgs/transformers_structure.png b/OpenDelta-0.3.2/docs/source/imgs/transformers_structure.png new file mode 100644 index 0000000..ded54d9 Binary files /dev/null and b/OpenDelta-0.3.2/docs/source/imgs/transformers_structure.png differ diff --git a/OpenDelta-0.3.2/docs/source/index.md b/OpenDelta-0.3.2/docs/source/index.md new file mode 100644 index 0000000..2bb28bb --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/index.md @@ -0,0 +1,75 @@ +OpenDelta's documentation! +===================================== + +[OpenDelta](https://github.com/thunlp/OpenDelta/) is a **Plug-and-play** Library of the parameter-efficient fine-tuning ([delta-tuning](WhatisDelta)) technology for pre-trained models. + + +## Essential Advantages: + +- Clean: No need to edit the backbone PTM’s codes. +- Simple: Migrating from full-model tuning to delta-tuning needs as little as 3 lines of codes. +- Sustainable: Most evolution in external library doesn’t require a new OpenDelta. +- Extendable: Various PTMs can share the same delta-tuning codes. +- Flexible: Able to apply delta-tuning to (almost) any position of the PTMs. + +```{eval-rst} +.. toctree:: + :maxdepth: 1 + :caption: Getting Started + + notes/overview.md + notes/installation.md + notes/quickstart.md + notes/custom.md + + +.. toctree:: + :maxdepth: 1 + :caption: Advanced Usage + + notes/autodelta.md + notes/deltacenter.md + notes/composition.md + notes/pluginunplug.md + notes/withbmtrain.md + notes/withaccelerate.md + notes/examples.md + +.. toctree:: + :maxdepth: 1 + :caption: Utilities + + notes/inspect.md + +.. toctree:: + :maxdepth: 1 + :caption: Mechanisms + + notes/keyfeature.md + notes/namebasedaddr.md + notes/unifyname.md + +.. toctree:: + :maxdepth: 1 + :caption: Information + + notes/citation.md + notes/update.md + notes/faq.md + +.. toctree:: + :maxdepth: 2 + :caption: Documentation + + modules/base + modules/deltas + modules/auto_delta + modules/utils + + +Indices and tables +================== + +* :ref:`genindex` + +``` \ No newline at end of file diff --git a/OpenDelta-0.3.2/docs/source/modules/auto_delta.rst b/OpenDelta-0.3.2/docs/source/modules/auto_delta.rst new file mode 100644 index 0000000..cc9d3d4 --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/modules/auto_delta.rst @@ -0,0 +1,14 @@ +Auto Classes +====================================== + + +AutoDeltaConfig +------------------------------------ +.. autoclass:: opendelta.auto_delta.AutoDeltaConfig + :members: + + +AutoDeltaModel +------------------------------------ +.. 
autoclass:: opendelta.auto_delta.AutoDeltaModel + :members: diff --git a/OpenDelta-0.3.2/docs/source/modules/base.rst b/OpenDelta-0.3.2/docs/source/modules/base.rst new file mode 100644 index 0000000..3a1a35e --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/modules/base.rst @@ -0,0 +1,14 @@ +Base Classes +====================================== + + +BaseDeltaConfig +------------------------------------ +.. autoclass:: opendelta.delta_configs.BaseDeltaConfig + :members: + + +DeltaBase +------------------------------------ +.. autoclass:: opendelta.basemodel.DeltaBase + :members: diff --git a/OpenDelta-0.3.2/docs/source/modules/deltas.rst b/OpenDelta-0.3.2/docs/source/modules/deltas.rst new file mode 100644 index 0000000..5a94fb6 --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/modules/deltas.rst @@ -0,0 +1,46 @@ +Delta Models +====================================== + + + +Lora +--------------------------------------- +.. autoclass:: opendelta.LoraModel + :members: + + + +BitFit +--------------------------------------- +.. autoclass:: opendelta.BitFitModel + :members: + + +Adapter +--------------------------------------- +.. autoclass:: opendelta.AdapterModel + :members: + + +LowRankAdapter +--------------------------------------- +.. autoclass:: opendelta.LowRankAdapterModel + :members: + + +Compacter +--------------------------------------- +.. autoclass:: opendelta.CompacterModel + :members: + + +Prefix tuning +------------------------------------ +.. autoclass:: opendelta.PrefixModel + :members: + + +Soft Prompt Tuning +------------------------------------ +.. autoclass:: opendelta.SoftPromptModel + :members: diff --git a/OpenDelta-0.3.2/docs/source/modules/utils.md b/OpenDelta-0.3.2/docs/source/modules/utils.md new file mode 100644 index 0000000..3d11305 --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/modules/utils.md @@ -0,0 +1,45 @@ +# Utils + + +## SaveLoadMixin + +```{eval-rst} +.. autoclass:: opendelta.utils.saving_loading_utils.SaveLoadMixin + :members: +``` + +## Visualization + + +```{eval-rst} +.. autoclass:: opendelta.utils.visualization.Visualization + :members: +``` + +## Structure Map +```{eval-rst} +.. autoclass:: opendelta.utils.structure_mapping.CommonStructureMap + :members: +``` + +## Utility Functions + +### Hashing +```{eval-rst} +.. automodule:: opendelta.utils.model_md5 + :members: +``` + +### Signature +```{eval-rst} +.. automodule:: opendelta.utils.signature + :members: +``` + +### Name-based addressing +```{eval-rst} +.. automodule:: opendelta.utils.name_based_addressing + :members: +``` + + diff --git a/OpenDelta-0.3.2/docs/source/notes/autodelta.md b/OpenDelta-0.3.2/docs/source/notes/autodelta.md new file mode 100644 index 0000000..28f339a --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/autodelta.md @@ -0,0 +1,90 @@ +(autodelta)= +# AutoDelta Mechanism + +Inspired by [Huggingface transformers AutoClasses](https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/auto#transformers.AutoModel), we provide the AutoDelta feature so that users can + +1. Easily experiment with different delta models +2. Quickly deploy from a configuration file, especially from the repos in [DeltaCenter](https://examplelink). + + +## Easily load from a dict, so that it is easy to change the type of delta model.
+ +```python +from opendelta import AutoDeltaConfig, AutoDeltaModel +from transformers import T5ForConditionalGeneration + +backbone_model = T5ForConditionalGeneration.from_pretrained("t5-base") +``` + +We can load a config from a dict: +```python +config_dict = { + "delta_type":"lora", + "modified_modules":[ + "SelfAttention.q", + "SelfAttention.v", + "SelfAttention.o" + ], + "lora_r":4} +delta_config = AutoDeltaConfig.from_dict(config_dict) +``` + +Then use the config to add a delta model to the backbone model: +```python +delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=backbone_model) + +# now visualize the modified backbone_model +from bigmodelvis import Visualization +Visualization(backbone_model).structure_graph() +``` + + +````{collapse} Click to view output +```{figure} ../imgs/t5lora.png +--- +width: 600px +name: t5lora +--- +``` +```` + + + +## Fast deploy from a finetuned delta checkpoint in DeltaCenter + +```python +# use transformers as usual. +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer +t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large") +t5_tokenizer = AutoTokenizer.from_pretrained("t5-large") +# A running example +inputs_ids = t5_tokenizer.encode("Is Harry Poter wrtten by JKrowling", return_tensors="pt") +t5_tokenizer.decode(t5.generate(inputs_ids)[0]) +# >>> '? Is it Harry Potter?' +``` + +Load the delta model from DeltaCenter: +```python +# use existing delta models +from opendelta import AutoDeltaModel, AutoDeltaConfig +# use existing delta models from DeltaCenter +delta = AutoDeltaModel.from_finetuned("thunlp/Spelling_Correction_T5_LRAdapter_demo", backbone_model=t5) +# freeze the whole backbone model except the delta models. +delta.freeze_module() +# visualize the change +delta.log() + +t5_tokenizer.decode(t5.generate(inputs_ids)[0]) +# >>> Is Harry Potter written by JK Rowling? +``` +
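+
+If you want to double-check that `freeze_module()` has frozen what you expect, a quick sanity check is to count the trainable parameters directly. The snippet below is only a sketch and relies on nothing beyond standard PyTorch attributes (`requires_grad`, `numel`); the exact numbers depend on the delta model you loaded.
+```python
+# sanity check (sketch): count trainable vs. total parameters after freeze_module()
+trainable = sum(p.numel() for p in t5.parameters() if p.requires_grad)
+total = sum(p.numel() for p in t5.parameters())
+print(f"trainable: {trainable} / total: {total} ({100 * trainable / total:.2f}%)")
+```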
+:::{admonition} Hash check
+:class: note
+Since the delta model only works together with the backbone model, we automatically check whether you load the delta model in the same way it was trained.
+
+We calculate the trained model's [md5](http://some_link) and save it to the config. After loading the delta model, we re-calculate the md5 to see whether it has changed.
+
+Passing the hash check guarantees consistent performance, but there are cases where the check is not passed and performance is still normal, for various reasons that we are still investigating. Please consider this feature as a supplementary safeguard.
+
+Pass `check_hash=False` to disable the hash checking.
+:::
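+
+As a sketch of how the check can be disabled (assuming `check_hash` is accepted as a keyword argument of `from_finetuned`; treat the argument name as an assumption):
+```python
+# skip the md5 verification when loading a delta checkpoint (assumed keyword argument)
+delta = AutoDeltaModel.from_finetuned("thunlp/Spelling_Correction_T5_LRAdapter_demo",
+                                      backbone_model=t5, check_hash=False)
+```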
\ No newline at end of file diff --git a/OpenDelta-0.3.2/docs/source/notes/citation.md b/OpenDelta-0.3.2/docs/source/notes/citation.md new file mode 100644 index 0000000..47a88ba --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/citation.md @@ -0,0 +1,12 @@ +# Citation + +If you find our repo useful, please cite the following paper. + +``` +@article{ding2022delta, + title={Delta tuning: A comprehensive study of parameter efficient methods for pre-trained language models}, + author={Ding, Ning and Qin, Yujia and Yang, Guang and Wei, Fuchao and Yang, Zonghan and Su, Yusheng and Hu, Shengding and Chen, Yulin and Chan, Chi-Min and Chen, Weize and others}, + journal={arXiv preprint arXiv:2203.06904}, + year={2022} +} +``` \ No newline at end of file diff --git a/OpenDelta-0.3.2/docs/source/notes/composition.md b/OpenDelta-0.3.2/docs/source/notes/composition.md new file mode 100644 index 0000000..a32db2c --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/composition.md @@ -0,0 +1,51 @@ +# Composition of delta models + +With OpenDelta, you can compose different delta models. + + +## Add different deltas to the backbone + +```python +from transformers import AutoModelForSequenceClassification +model = AutoModelForSequenceClassification.from_pretrained("roberta-base") +from opendelta import LoraModel, AdapterModel +delta_model = LoraModel(backbone_model=model, modified_modules=['key'], lora_r=1) +delta_model2 = AdapterModel(backbone_model=model, modified_modules=['output'], bottleneck_dim=12) +delta_model.log() +``` +````{collapse} Click to view output +```{figure} ../imgs/composition_of_delta.png +--- +width: 600px +name: composition_of_delta +--- +``` +```` + + + +## Even add multiple deltas to the same layer + +```python +from transformers import AutoModelForSequenceClassification +model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base") +from opendelta import AdapterModel, LowRankAdapterModel +delta_model = AdapterModel(backbone_model=model, modified_modules=['fc2']) +delta_model2 = AdapterModel(backbone_model=model, modified_modules=['fc2'], bottleneck_dim=12) +delta_model3 = LowRankAdapterModel(backbone_model=model, modified_modules=['fc2'], reduction_factor=12) +delta_model.log() +``` +````{collapse} Click to view output +```{figure} ../imgs/multiple_to_one_layer.png +--- +width: 600px +name: multiple_to_one_layer +--- +``` +```` +:::{admonition} Order of Insertion +:class: warning +**When adding several deltas to the same layer, please pay attention to the order in which they are added.** As in the above example, the deltas are added after `fc2`, so the tensor will first go through `adapter`, then `adapter_1`, and finally the low-rank adapter. If a delta is added before the backbone layer, then the last added delta will be the first to go through. + +Also, pay attention to the detaching order. The delta that is added first should be the last to be detached. +::: \ No newline at end of file diff --git a/OpenDelta-0.3.2/docs/source/notes/custom.md b/OpenDelta-0.3.2/docs/source/notes/custom.md new file mode 100644 index 0000000..a41ff66 --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/custom.md @@ -0,0 +1,135 @@ +# Custom Usage +Now we introduce the pipeline to migrate your full-model tuning scripts to delta tuning ones, **especially when your model is not in the default configuration list, or you don't want to use the default configuration**. A compact preview sketch is given below; each step is then explained in detail.
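+
+A minimal end-to-end sketch of the whole pipeline (assuming a BART backbone and an adapter after `fc2`, exactly as used in the steps below; this is a preview, not a complete training script):
+```python
+from transformers import AutoModelForSequenceClassification
+from opendelta import AdapterModel
+
+model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base")               # STEP 1
+delta_model = AdapterModel(backbone_model=model, modified_modules=['fc2'], bottleneck_dim=12)  # STEP 2
+delta_model.freeze_module(exclude=["deltas", "layernorm_embedding"], set_state_dict=True)      # STEP 3
+# ... train `model` with your existing training loop (STEP 4) ...
+delta_model.save_finetuned("some_local_path/")                                                  # STEP 5
+```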
+ +## STEP 1: Load the pretrained models + +```python +from transformers import AutoModelForSequenceClassification +model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base") # suppose we load BART +``` + +## STEP 2: Add delta modules +We provide two alternatives to add the delta modules. +### 2.1 Visualize the backbone structure +Delta tuning's core change to the structure of the base model is to decorate (modify) the modules of the base model with small delta modules. We assume we want to treat the feedforward layer of each block as our [target modules](targetmodules). Since **different PLMs name their submodules differently**, we should first find out the name of the feedforward layer in the BART model by visualization. *For more about visualization, see [Visualization](visualization).* + +```python +from bigmodelvis import Visualization +Visualization(model).structure_graph() +``` + +````{collapse} Click to view output +```{figure} ../imgs/bart-base.png +--- +width: 600px +name: bart-base +--- +``` +```` + + +We can see from the structure graph that the feed forward layers in BART are called `model.encoder.layers.$.fc1` and `model.encoder.layers.$.fc2`, where `$` represents a number from 0-5. Since we want to apply the adapter after *all* the feed forward layers, we specify `modified_modules=['fc2']`, which is the common suffix of the feed forward layers. *For details about name-based addressing, see [Name-based submodule addressing](namebasedaddr)* + +Other configurations, such as the `bottleneck_dim` in Adapter, can be passed as keyword arguments. +```python +from opendelta import AdapterModel +delta_model = AdapterModel(backbone_model=model, modified_modules=['fc2'], bottleneck_dim=12) +delta_model.log() # This will visualize the backbone after modification and other information. +``` + + +:::{admonition} Try different positions +:class: tip +OpenDelta provides the flexibility to add deltas to various positions on the backbone model. For example, if you want to move the adapter in the above example to after the layer norm of the feed forward layer, the code should be changed into +```python +delta_model = AdapterModel(backbone_model=model, modified_modules=['final_layer_norm'], bottleneck_dim=12) +``` +The performance may vary due to positional differences, but there is currently no theoretical guarantee that one will outperform the other. +::: + + +:::{admonition} Favored Configurations +:class: tip +Feel confused about the flexibility that OpenDelta brings? The default configuration is the `default_modified_modules` attribute of each delta model. Generally, the default configurations are already good enough. If you want to squeeze the size of delta models further, you can refer to the following papers. + + - [AdapterDrop: On the Efficiency of Adapters in Transformers](https://arxiv.org/abs/2010.11918) + - [Sparse Structure Search for Parameter-Efficient Tuning (Delta Tuning)](https://arxiv.org/abs/2206.07382) +::: + +## STEP 3: Freeze parameters +So far the backbone model is still fully tunable. To freeze the main part of the backbone model except the trainable parts (usually the delta parameters), use the [freeze_module](opendelta.basemodel.DeltaBase.freeze_module) method. The syntax of the `exclude` field also obeys the [name-based addressing](namebasedaddr) rules.
+ + +```python +delta_model.freeze_module(exclude=["deltas", "layernorm_embedding"]) +delta_model.log() +``` +````{collapse} Click to view output +```{figure} ../imgs/afterfreeze.png +--- +width: 600px +name: afterfreeze +--- +``` +```` + +Usually, we only want to save the trainable part, so we should modify the `state_dict` of the backbone model, which originally contains all the parameters. With `set_state_dict=True`, `model.state_dict()` only contains the trainable parameters. +```python +delta_model.freeze_module(exclude=["deltas", "layernorm_embedding"], set_state_dict=True) +``` + + +## STEP 4: Normal training pipeline + +The **model** can then be trained with traditional training scripts. Two things should be noted: + +:::{admonition} Note +:class: note +1. No need to change the optimizer, since the optimizer only calculates and stores gradients for parameters with `requires_grad=True`, and the `requires_grad` attribute has been changed during the call to the [freeze_module](opendelta.basemodel.DeltaBase.freeze_module) method. +2. `model.eval()` or `model.train()` should be used if you need to enable/disable dropout. OpenDelta doesn't touch those configurations. +::: + + +## STEP 5: Save and load the Delta Model +### Option 1: Use the OpenDelta interface. +One option is to use our provided interface. This will save both the configuration of the delta model and all trainable parameters. +```python +delta_model.save_finetuned("some_local_path/") +``` +When loading the delta_model, just call the `from_finetuned` method. Note that the loaded model is fully trainable. If you want to continue to train it, please use `freeze_module` again. +```python +from transformers import AutoModelForSequenceClassification +model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base") +from opendelta import AutoDeltaModel +delta_model = AutoDeltaModel.from_finetuned("some_local_path/", backbone_model=model) +``` + +### Option 2: Use the PyTorch interface. +Another option is to save and load the model in the traditional PyTorch way. +```python +import torch +torch.save(model.state_dict(), "some_local_path/pytorch_model.bin") +``` +Then load it into an initialized backbone model with the delta model attached. Remember to use `strict=False`, since the state_dict now contains only the trainable parameters. + +```python +import torch +from transformers import AutoModelForSequenceClassification +model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base") +from opendelta import AdapterModel +delta_model = AdapterModel(backbone_model=model, modified_modules=['fc2'], bottleneck_dim=12) +model.load_state_dict(torch.load("some_local_path/pytorch_model.bin"), strict=False) +``` + +### Option 3: Save and upload to DeltaCenter. +You can also save the delta model to DeltaCenter to share it with the community. See the [instructions](deltacenter). + + diff --git a/OpenDelta-0.3.2/docs/source/notes/deltacenter.md b/OpenDelta-0.3.2/docs/source/notes/deltacenter.md new file mode 100644 index 0000000..d344023 --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/deltacenter.md @@ -0,0 +1,35 @@ +# DeltaCenter + +## Share to Delta Center +```python +delta_model.save_finetuned("test_delta_model", push_to_dc = True) +``` + +## Download from Delta Center +```python +# use transformers as usual.
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer +t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large") +t5_tokenizer = AutoTokenizer.from_pretrained("t5-large") +# A running example +inputs_ids = t5_tokenizer.encode("Is Harry Poter wrtten by JKrowling", return_tensors="pt") +t5_tokenizer.decode(t5.generate(inputs_ids)[0]) +# >>> '? Is it Harry Potter?' +``` + +Load delta model from delta center: +```python +# use existing delta models +from opendelta import AutoDeltaModel, AutoDeltaConfig +# use existing delta models from DeltaCenter +delta = AutoDeltaModel.from_finetuned("thunlp/Spelling_Correction_T5_LRAdapter_demo", backbone_model=t5) +# freeze the whole backbone model except the delta models. +delta.freeze_module() +# visualize the change +delta.log() + +t5_tokenizer.decode(t5.generate(inputs_ids)[0]) +# >>> Is Harry Potter written by JK Rowling? +``` + + diff --git a/OpenDelta-0.3.2/docs/source/notes/examples.md b/OpenDelta-0.3.2/docs/source/notes/examples.md new file mode 100644 index 0000000..fb346a2 --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/examples.md @@ -0,0 +1,16 @@ +# Examples + +## examples_prompt +| | Lora | Bias
Tuning | Adapter
Houlsby | Adapter
Pfeiffer | Adapter
Drop | Adapter
Low-Rank | Compactor | Prefix
Tuning | Prompt
Tuning | +| --------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ----- | ----- | +| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | +| BART | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | +| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | +| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | +| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| T5-3b(parallel)| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| Deberta-v2 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | +| CTRL | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | + +## tutorials \ No newline at end of file diff --git a/OpenDelta-0.3.2/docs/source/notes/faq.md b/OpenDelta-0.3.2/docs/source/notes/faq.md new file mode 100644 index 0000000..164c3a0 --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/faq.md @@ -0,0 +1,14 @@ +# FAQs + +1. **Why I encounder NotImplementedError in Prefix Tuning?** + + This is because we find no easy way to get a unified Prefix Tuning implementation for different attention classes. If you really want to use Prefix Tuning for the models we have not supported, you can implement the ``PrefixLayerYOURMODEL`` on your own or raise a issue to request the feature for your model. + +2. **Available Models with default configurations are ..., Please manually add the delta models by speicifying 'modified_modules' based on the visualization of your model structure** + + Although most pre-trained models (PTMs) use the transformers archtecture, they are implemented differently. For example, the attention module in GPT2 and BERT is not only named differently, but also implemented in different ways. Common structure mapping mapps the different name conventions of different PTMs into a unified name convention. But there are many PTMs that we do not currently cover. But don't worry! For these models, you can figure out which modules should you modify by simply [visualizing the PTMs](visualization), and then specify the `modified modules` manually (See [name-based addressing](namebasedaddr)). + + +3. **Requires a dummy_inputs to be passed through the model to understand the dimensionality of each tensor in the computation graph. The {module.__class__.__name__} Class has no dummy_inputs, and automatically created dummy_inputs failed.** + + The `dummy_inputs` can be any data that make `backbone_model.forward(**dummy_inputs)` succeed. Only the form and shape of the `dummy_inputs` matter. To set dummy_inputs for your model, please use: `setattr(backbone_model, 'dummy_inputs', some_dummy_inputs)` before initializing `{self.__class__.__name__}`. \ No newline at end of file diff --git a/OpenDelta-0.3.2/docs/source/notes/inspect.md b/OpenDelta-0.3.2/docs/source/notes/inspect.md new file mode 100644 index 0000000..53ca466 --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/inspect.md @@ -0,0 +1,129 @@ + +(visualization)= +# Visualize the Parameters + +When OpenDelta makes modifications to a pretrained model (PTM), it is beneficial to know what your PTM looks like, especially the location of the parameters. + +- **Before** applying opendelta, you can know **how to specify your modifications in terms of key addressing**. +- **After** the modification is done, you can know **if your modification is what you expected**, for example, whether the position of the delta +modules are desired, or whether you froze the correct parameters. + +Now let's begin to try the visualization utility. + +## Visualization is NOT easy using pytorch native function. 
+ +```python +from transformers import BertForMaskedLM +backbone_model = BertForMaskedLM.from_pretrained("bert-base-uncased") +print(backbone_model) +``` + +````{collapse} Click to view output +```{figure} ../imgs/raw_print.png +--- +width: 600px +name: raw_print +--- +``` +```` + +The original presentation of models is **not tailored for repeated structures, big models, or parameters-centric tasks**. + + +## Using visualization from bigmodelvis. + +First let's visualize all the parameters in the bert model. As we can see, structure inside a bert model, and the all the paramters location of the model are neatly represented in tree structure. (See [color scheme](color_schema) for the colors) + +```python +from bigmodelvis import Visualization +model_vis = Visualization(backbone_model) +model_vis.structure_graph() +``` + + +```{figure} ../imgs/bert_vis.png +--- +width: 600px +name: bert_vis +--- +``` + + + +
:::{admonition} Suggestion
:class: tip
We can easily reference a module according to the graph:
```python
print(backbone_model.bert.encoder.layer[0].intermediate)
```
When using opendelta on a new backbone model, it is better to first visualize the child module names (shown in white) and then designate the `modified_modules`.
:::
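If you prefer a programmatic view over the graph, a minimal sketch like the one below (plain PyTorch, assuming the `backbone_model` loaded above) lists the names of all linear submodules; suffixes shared across layers, such as `output.dense`, are natural candidates for `modified_modules`:

```python
import torch.nn as nn

# names of every nn.Linear submodule in the backbone
linear_names = [name for name, module in backbone_model.named_modules()
                if isinstance(module, nn.Linear)]
print(linear_names[:6])

# suffixes (last two name segments) that repeat across layers are
# good candidates to pass as `modified_modules`
suffixes = sorted({".".join(name.split(".")[-2:]) for name in linear_names})
print(suffixes)
```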
+ + + + +## Now add a delta model and visualize the change. + + +```python +from opendelta import LowRankAdapterModel +delta_model = LowRankAdapterModel(backbone_model) +delta_model.freeze_module(exclude=["cls", "intermediate", "LayerNorm"]) +Visualization(backbone_model).structure_graph() +``` + +````{collapse} Click to view output +```{figure} ../imgs/bertdelta_vis.png +--- +width: 600px +name: bertdelta_vis +--- +``` +```` + +(color_schema)= +
:::{admonition} Color Schema
:class: note
- The white part is the name of the module.
- The green part is the module's type.
- The blue part is the tunable parameters, i.e., the parameters that require grad computation.
- The grey part is the frozen parameters, i.e., the parameters that do not require grad computation.
- The red part is the structure that is repeated and thus folded.
- The purple part is the delta parameters inserted into the backbone model.
:::
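As a quick sanity check of the blue (tunable) versus grey (frozen) split shown above, you can also count parameters by their `requires_grad` flag; a small sketch, assuming `backbone_model` has already been modified and frozen as in the example above:

```python
# count tunable vs. frozen parameters to cross-check the visualization
trainable = sum(p.numel() for p in backbone_model.parameters() if p.requires_grad)
frozen = sum(p.numel() for p in backbone_model.parameters() if not p.requires_grad)
print(f"trainable: {trainable:,} | frozen: {frozen:,} "
      f"| ratio: {trainable / (trainable + frozen):.2%}")
```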
+ +:::{admonition} PlatForm Sentivity +:class: warning +Depending on the platform the code is running on, the colors may vary slightly. +::: + + + + +## We also provide the option to visualize the nodes without parameters. + +```python +Visualization(backbone_model).structure_graph(keep_non_params=True) +``` + +Thus, the modules like dropout and activations are kept. + + +````{collapse} Click to view output +```{figure} ../imgs/bertdelta_noparam.png +--- +width: 600px +name: bertdelta_noparam +--- +``` +```` + +:::{admonition} Order of the submodule +:class: warning +Currently, OpenDelta‘s Visualization visualize the model based on pytorch's named_modules method. That means the order of the presented submodule is the order they are add to the parent module, not necessarily the order that tensors flows through. +::: + + +# Inspect the optimizer \ No newline at end of file diff --git a/OpenDelta-0.3.2/docs/source/notes/installation.md b/OpenDelta-0.3.2/docs/source/notes/installation.md new file mode 100644 index 0000000..cbdf63e --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/installation.md @@ -0,0 +1,31 @@ + +(installation)= +# Installation + + +The lasted version of OpenDelta is tested on on [Python 3.8](https://www.python.org/) and [Pytorch 1.12](). Other versions are likely to be supported as well. + + +## install the lastest version +```bash +pip install git+https://github.com/thunlp/OpenDelta.git +``` + +## install the lastest pip version (more stable) +```bash +pip install opendelta +``` + +## build from source +```bash +git clone git@github.com:thunlp/OpenDelta.git +cd OpenDelta +``` +then +``` +python setup.py install +``` +or if you want to do some modifications on the code for your research: +``` +python setup.py develop +``` \ No newline at end of file diff --git a/OpenDelta-0.3.2/docs/source/notes/keyfeature.md b/OpenDelta-0.3.2/docs/source/notes/keyfeature.md new file mode 100644 index 0000000..dc71d82 --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/keyfeature.md @@ -0,0 +1,68 @@ +(keyfeature)= +# Philosophy and Key Features + +:::{admonition} Plug-and-play Design. +:class: tip + +Existing open-source project to propogate this **''delta-tuning''** paradigm includes +AdapterHub, which copies the transformers code base and modify on it, which makes it unintuitive to transfer from a normal code base to a delta-tuning ones. + +OpenDelta approaches this problem via a **true plug-and-play** fashion to the PLMs. To migrate from a full-model finetuning training scripts to a delta tuning training scripts, you **DO NOT** need to change the backbone bone model code base to an adapted code base. +::: + + +Here is how we achieve it. + + **Read through it will also help you to implement your own delta models in a sustainable way.** + + +## 1. Name-based submodule addressing. +See [name based addressing](namebasedaddr) +## 2. Three basic submodule-level delta operations. +We use three key functions to achieve the modifications to the backbone model outside the backbone model's code. + +1. **unfreeze some paramters** + + Some delta models will unfreeze a part of the model parameters and freeze other parts of the model, e.g. [BitFit](https://arxiv.org/abs/2106.10199). For these methods, just use [freeze_module](opendelta.basemodel.DeltaBase.freeze_module) method and pass the delta parts into `exclude`. + +2. **replace an module** + + Some delta models will replace a part of the model with a delta model, i.e., the hidden states will no longer go through the original submodules. 
This includes [Lora](https://arxiv.org/abs/2106.09685). + For these methods, we have an [update_module](opendelta.basemodel.DeltaBase.replace_module) interface. + +3. **insertion to the backbone** + + - **sequential insertion** + + Most adapter model insert a new adapter layer after/before the original transformers blocks. For these methods, insert the adapter's forward function after/before the original layer's forward function using [insert_sequential_module](opendelta.basemodel.DeltaBase.insert_sequential_module) interface. + - **parallel insertion** + + Adapters can also be used in a parallel fashion (see [Paper](https://arxiv.org/abs/2110.04366)). + For these methods, use [insert_parallel_module](opendelta.basemodel.DeltaBase.insert_parallel_module) interface. + + +:::{admonition} Doc-preserving Insertion +:class: note +In the insertion operations, the replaced forward function will inherit the doc strings of the original functions. +::: + +## 3. Pseudo input to initialize. +Some delta models, especially the ones that is newly introduced into the backbone, will need to determine the parameters' shape. To get the shape, we pass a pseudo input to the backbone model and determine the shape of each delta layer according to the need of smooth tensor flow. + +:::{admonition} Pseudo Input +:class: warning +Most models in [Huggingface Transformers](https://huggingface.co/docs/transformers/index) have an attribute [dummy_inputs](https://github.com/huggingface/transformers/blob/v4.16.2/src/transformers/modeling_utils.py#L464). This will create a nonsensical input with the correct format to pass into the model's forward function. + +For the models that doesn't inherit/implement this attributes, we assume the pseudo input to the model is something like `input_id`, i.e., an integer tensor. +```python +pseudo_input = torch.tensor([[0,0,0]]) +# or +pseudo_input = torch.tensor([0,0,0]) +``` + We will add interface to allow more pseudo input in the future. +::: + + + + + diff --git a/OpenDelta-0.3.2/docs/source/notes/namebasedaddr.md b/OpenDelta-0.3.2/docs/source/notes/namebasedaddr.md new file mode 100644 index 0000000..7dc8c4e --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/namebasedaddr.md @@ -0,0 +1,185 @@ + +# Name-based Addressing + +Named based addressing is what set OpenDelta apart from other packages and provide the possibility to be used to a broader range of models (even emerging ones). + + +## Name of a submodule. +We locate the submodules that we want to apply a delta layer via name-based addressing. + +In pytorch fashion, a submodule can be accessed from a root model via 'dot' addressing. 
For example, we define a toy language model + +```python +import torch.nn as nn +class MyNet1(nn.Module): + def __init__(self,): + super().__init__() + self.name_a = nn.Linear(5,5) + def forward(self, hiddens): + return self.name_a(hiddens) + +class MyNet2(nn.Module): + def __init__(self,): + super().__init__() + self.embedding = nn.Embedding(10,5) + self.name_b = nn.Sequential(MyNet1(), MyNet1()) + def forward(self, input_ids): + hiddens = self.embedding(input_ids) + return self.name_b(hiddens) + +root = MyNet2() +print(root.name_b[0].name_a) +# Linear(in_features=5, out_features=5, bias=True) +``` + +We can visualize the model (For details, see [visualization](visualization)) + +```python +from bigmodelvis import Visualization +Visualization(root).structure_graph() +``` + +````{collapse} Click to view output +```{figure} ../imgs/name_based_addressing.png +--- +width: 500px +name: name_based_addressing +--- +``` +```` + +In this case, string `"name_b.0.name_a"` will be the name to address the submodule from the root model. + +Thus when applying a delta model to this toy net. + +```python +from opendelta import AdapterModel +AdapterModel(backbone_model=root, modified_modules=['name_b.0.name_a']) +Visualization(root).structure_graph() +``` + +````{collapse} Click to view output +```{figure} ../imgs/toy-delta.png +--- +width: 500px +name: toy-delta +--- +``` +```` + +(targetmodules)= +## Target modules. + +For different delta methods, the operation for the modification target is different. +- Adapter based method: Insert at the target module's forward function. +- BitFit: Add bias to all allowed position of the target module. +- Lora: Substitute the all the linear layers of the target module with [Lora.Linear](https://github.com/microsoft/LoRA/blob/main/loralib/layers.py#L92). +- Prefix Tuning: the target module must be an attention module. + +:::{admonition} Auto Searching +:class: note +We are working on unifying operations to automatically search within a given module for its submodules that can be applied using a specific delta method. +::: + +## Makes addressing easier. + +Handcrafting the full names of submodules can be frustrating. We made some simplifications + +1. **End-matching** Rules. + + OpenDelta will take every modules that + **ends with** the provided name suffix as the modification [target module](targetmodules). + :::{admonition} Example + :class: tip + Taking DistilBert with an classifier on top as an example: + - set to `["0.attention.out_lin"]` will add delta modules to the attention output of distilbert's + ayer 0, i.e., `distilbert.transformer.layer.0.attention.out_lin`. + - set to `["attention.out_lin"]` will add the delta modules in every layer's `attention.out_lin`. + ::: + + +(regexexpr)= +2. Regular Expression. + + We also support regex end-matching rules. + We use a beginning `[r]` followed by a regular expression to represent this rule, where `[r]` is used to distinguish it from normal string matching rules and has no other meanings. + + Taking RoBERTa with an classifier on top as an example: It has two modules named `roberta.encoder.layer.0.attention.output.dense` and `roberta.encoder.layer.0.output.dense`, which both end up with `output.dense`. To distinguish them: + + - set `'[r](\d)+\.output.dense'` using regex rules, where `(\d)+` match any layer numbers. This rule will match all `roberta.encoder.layer.$.output.dense`. where `$` represents all integer numbers, here in a 12-layer RoBERTa, it's 0-11. 
+ + - set `'[r][0-5]\.attention'` will match only the 0-5 layers' attention submodule. + + - set `'attention.output.dense'` using ordinary rules, which only match `roberta.encoder.layer.0.attention.output.dense`. + + :::{admonition} Regex in Json Configs + :class: warning + In json, you should write `"\\."` instead of `"\."` for a real dot due to json parsing rules. That is + ``` + { + ... + "modified_moduls": ['[r][0-5]\\.attention'], + ... + } + ``` + ::: + + +3. Interactive Selection. + + We provide a way to interact visually to select modules needed. + + ```python + from transformers import BertForMaskedLM + model = BertForMaskedLM.from_pretrained("bert-base-cased") + # suppose we load BERT + + from opendelta import LoraModel # use lora as an example, others are same + delta_model = LoraModel(backbone_model=model, interactive_modify=True) + ``` + + by setting `interactive_modify`, a web server will be opened on local host, and the link will be print in the terminal, e.g., + + ``` + http://0.0.0.0:8888/ + ``` + + If on your local machine, click to open the link for interactive modification. + + If on remote host, you could use port mapping. For example, vscode terminal will automatically do port mapping for you, you can simply use `control/command + click` to open the link. + + You can change the port number in case the default port number is occupied by other program by setting `interactive_modify=port_number`, in which port_number is an integer. + + The web page looks like the following figure. + + ```{figure} ../imgs/interact.jpg + --- + width: 500px + name: interact web page + --- + ``` + + - By clicking on `[+]`/`[-]` to expand / collapse tree nodes. + + - By clicking on text to select tree nodes, **yellow dotted** box indicates the selection. + + - **Double** click on the pink `[*]` is an advanced option to unfold the repeated nodes. By default, modules with the same architecture are folded into one node and are marked in red, for example, the `BertLayer` of layers 0~11 in the above figure are in the same structure. Regular model changes will make the same changes to each layers. + + - If you want to change only a few of them, first double-click on `[*]`, then select the parts you want in the unfolded structure. + + - If you want to make the same change to all but a few of them, first select the common parts you want in the folded structure, then double-click on `[*]` to remove the few positions you don't need to change in the expanded structure. + + Click `submit` button on the top-right corner, then go back to your terminal, you can get a list of name-based addresses printed in the terminal in the following format, and these modules are being "delta". + + ``` + modified_modules: + [bert.encoder.layer.0.output.dense, ..., bert.encoder.layer.11.output.dense] + ``` + + +## Examples +Nothing works better than a few lively examples. +Comming Soon... + + + diff --git a/OpenDelta-0.3.2/docs/source/notes/overview.md b/OpenDelta-0.3.2/docs/source/notes/overview.md new file mode 100644 index 0000000..ccec827 --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/overview.md @@ -0,0 +1,36 @@ +# What is Delta-tuning and Why OpenDelta? + +(WhatisDelta)= +:::{admonition} What is Delta? +:class: tip + +As Pre-trained language models (PLMs) have become the fundamental infrastructure on many NLP tasks and benchmarks, it is becoming increasingly clear from recent research that **larger models tend to lead to better performance**. 
However, large-scale PLMs also bring prohibitive adaptation costs when fine-tuning all the parameters of a model and retaining separate instances for different tasks. + +**Parameter-efficient model stimulation methods** thus have attracted researchers' eyes, which only tune a small fraction of model parameter while achieving comparable or even better performance than full-model fine-tuning, dubbed as "Delta-tuning". + +**Delta** thus means a small fraction $\Delta\Theta$ of parameters besides the pretrained models $\Theta_0$. + +\begin{gather*} +\Theta \sim \Theta_0\text{(frozen)} + \Delta\Theta\text{(tunable)} +\end{gather*} + +This open-source project implement several delta-tuning methods, which allows researchers and engineers to quickly migrate their codes from full-model tuning to delta-tuning without replace the backend (the implementation of the backbone PLM). +::: + + + +## Why OpenDelta? + +- Clean: No need to edit the backbone PTM’s codes. +- Simple: Migrating from full-model tuning to delta-tuning needs as little as 3 lines of codes. +- Sustainable: Most evolution in external library doesn’t require a new OpenDelta. +- Extendable: Various PTMs can share the same delta-tuning codes. +- Flexible: Able to apply delta-tuning to (almost) any position of the PTMs. + + +## Delta-tuning papers + + + + + diff --git a/OpenDelta-0.3.2/docs/source/notes/pluginunplug.md b/OpenDelta-0.3.2/docs/source/notes/pluginunplug.md new file mode 100644 index 0000000..dae80c8 --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/pluginunplug.md @@ -0,0 +1,113 @@ +# Multitask Modeling using OpenDelta + +:::{admonition} Multitask Serving with Delta-tuning +:class: tip +A huge advange of Delta-tuning is that it can be used for multitask serving. +Imagine we have a pretrained model trained on a mix of data coming from multiple languages, e.g.,English, Chinese, and French. Now you want to have seperate models that specialise in Chinese, French, English. We can thus delta-tune three deltas on each language with small amount of additional language-specific data. During serving, when a Chinese sentence comes, you attach the "Chinese Delta", and next a French sentence comes, you detach the "Chinese Delta", and attach a "French Delta". +::: + +**Here is how to achieve multitask serving using OpenDelta.** + +```python +from transformers import AutoModelForSequenceClassification +model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base") +from opendelta import LoraModel +delta_model = LoraModel(backbone_model=model, modified_modules=['fc2']) +delta_model.log() +``` +````{collapse} Click to view output +```{figure} ../imgs/plugunplug1.png +--- +width: 800px +name: plugunplug1 +--- +``` +```` + +Now we detach the deltas from the backbone +```python +delta_model.detach() +delta_model.log() +``` +````{collapse} Click to view output +```{figure} ../imgs/plugunplug2.png +--- +width: 800px +name: plugunplug2 +--- +``` +```` + +We can reattach the deltas to the backbone +```python +delta_model.attach() +delta_model.log() +``` + +````{collapse} Click to view output +```{figure} ../imgs/plugunplug3.png +--- +width: 800px +name: plugunplug3 +--- +``` +```` + +:::{admonition} Independence of Different Delta Models +:class: note +Different delta models will be independent in detaching and attaching. +(But the visualization will not show all deltas in the backbone model.) 
+```python +# continue from the above example +from opendelta import AdapterModel +delta_model2 = AdapterModel(backbone_model=model, modified_modules=['fc1']) +delta_model2.log() +``` +````{collapse} Click to view output +```{figure} ../imgs/plugunplug4.png +--- +width: 800px +name: plugunplug4 +--- +``` +```` + +detach the lora delta +```python +delta_model.detach() # detach the lora delta +delta_model.log() +``` +````{collapse} Click to view output +```{figure} ../imgs/plugunplug5.png +--- +width: 800px +name: plugunplug5 +--- +``` +```` + +detach the adapter delta and reattach the lora delta +```python +delta_model2.detach() # detach the adapter delta +delta_model.attach() # reattach the lora delta +delta_model.log() +``` +````{collapse} Click to view output +```{figure} ../imgs/plugunplug6.png +--- +width: 800px +name: plugunplug6 +--- +``` +```` +::: + + +:::{admonition} BitFit not supported +:class: warning + Currently detach is not suitable for BitFit, which modify the requires_grad property. Please wait for future releases. +::: + + + + diff --git a/OpenDelta-0.3.2/docs/source/notes/quickstart.md b/OpenDelta-0.3.2/docs/source/notes/quickstart.md new file mode 100644 index 0000000..9d6b24d --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/quickstart.md @@ -0,0 +1,38 @@ +(basics)= +# Quick Start +Now we introduce the most basic interface to migrate your full-model tuning scripts to a delta tuning one **on some commonly used PTMs or their derivative models** (the models that has the PTM as their submodule,e.g., BERTForSequenceClassification). [try in colab](https://colab.research.google.com/drive/1SB6W5B-2nKxOnkwHSIe3oGXZ7m53u_Vf?usp=sharing) + +```diff + from transformers import AutoModelForSequenceClassification + model = AutoModelForSequenceClassification.from_pretrained("bert-large-cased") + ++ from opendelta import AdapterModel ++ delta_model = AdapterModel(model) ++ delta_model.freeze_module(exclude=["deltas", "classifier"]) # leave the delta tuning modules and the newly initialized classification head tunable. ++ # delta_model.log() # optional: to visualize how the `model` changes. + + training_dataloader = get_dataloader() + optimizer, loss_function = get_optimizer_loss_function() + for batch in training_dataloader: + optimizer.zero_grad() + targets = batch.pop('labels') + outputs = model(**batch).logits + loss = loss_function(outputs, targets) + loss.backward() + optimizer.step() + print(loss) + +- torch.save(model.state_dict(), "finetuned_bert.ckpt") ++ delta_model.save_finetuned("finetuned_bert") +``` + +We currently support the following models and their derivative models in their default configurations. + +- BERT +- DeBERTa-v2 +- GPT2 +- OPT +- RoBERTa +- T5 + +For model not in the above list, please refer to more detailed [custom usage](custom). \ No newline at end of file diff --git a/OpenDelta-0.3.2/docs/source/notes/unifyname.md b/OpenDelta-0.3.2/docs/source/notes/unifyname.md new file mode 100644 index 0000000..f77fd68 --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/unifyname.md @@ -0,0 +1,82 @@ +(commonstructure)= + +# Common Structure Mapping + +```{figure} ../imgs/transformers_structure.png +:width: 400px +:name: transformers_structure +``` + +Although different PTMs often share similar Transformers structures, the codebases, and most importantly, the variable names for each submodule, are quite different. 
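To make the naming differences concrete, here is a small sketch (assuming the `bert-base-uncased` and `facebook/bart-base` checkpoints can be downloaded) that prints the linear submodules of each model's first encoder layer; the same feed-forward position is named `intermediate.dense`/`output.dense` in BERT but `fc1`/`fc2` in BART:

```python
import torch.nn as nn
from transformers import AutoModel

for checkpoint in ["bert-base-uncased", "facebook/bart-base"]:
    model = AutoModel.from_pretrained(checkpoint)
    # linear submodules inside the first encoder layer only
    layer0_linears = [name for name, module in model.named_modules()
                      if isinstance(module, nn.Linear)
                      and (name.startswith("encoder.layer.0.")
                           or name.startswith("encoder.layers.0."))]
    print(checkpoint, layer0_linears)
```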
+ + + +On the one hand, we **encourage the users to first [visualize](visualization) the PTMs' structure and then determine the name of submoduels.** + +On the other hand, we designed a unified name convention of Transformer Structure, and provided several structure mapping from the original name to the unified name convention. + +In this section, we will illustrate the unified name convention and structure mapping. + + +## Common blocks in Transformers structure. + + +- embeddings (word embedding) +- encoder + - block + - $ (layer_id) + - attn + - q, k, v + - proj + - layer_norm + - ff + - w1 + - w2 + - layer_norm +- decoder (similar to encoder) +- lm_head + - proj + +Visualize bert-base using a common structure name: The submodules that are not common are grey. + +```{figure} ../imgs/commonstructure_vis.png +:width: 600px +:name: commonstructure_vis +``` + +(mappingexample)= +## Example + +Example of bert mapping: a tree with node names specified by "\_\_name\_\_" +```json +{ + "bert.embeddings.word_embeddings": {"__name__":"embeddings"}, + "bert.embeddings.position_embeddings": {"__name__":""}, + "bert.embeddings.token_type_embeddings": {"__name__":""}, + "bert.embeddings.LayerNorm": {"__name__":""}, + "bert.encoder": {"__name__":"encoder", + "layer": {"__name__":"block", + "$": {"__name__":"$", + "attention": {"__name__":"attn", + "self.query": {"__name__":"q"}, + "self.key": {"__name__":"k"}, + "self.value": {"__name__":"v"}, + "output.dense": {"__name__":"proj"}, + "output.LayerNorm": {"__name__":"layer_norm"}, + }, + "output": {"__name__":"ff", + "dense": {"__name__":"w2"}, + "LayerNorm": {"__name__":"layer_norm"} + }, + "intermediate.dense": {"__name__":"ff.w1"}, + } + } + }, + "cls.predictions": {"__name__": "lm_head", + "transform.dense": {"__name__":""}, + "transform.LayerNorm": {"__name__":""}, + "decoder": {"__name__":"proj"}, + } +} +``` + diff --git a/OpenDelta-0.3.2/docs/source/notes/update.md b/OpenDelta-0.3.2/docs/source/notes/update.md new file mode 100644 index 0000000..748e341 --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/update.md @@ -0,0 +1,35 @@ +# Update Logs and Known Issues + +## Version 0.3.2 +- We improve the docs. +- We support BMTrain to accelerate the training, and parallelize the training of models that are hard to fit in a single GPU. Check [tutorial/2_with_bmtrain.py](https://github.com/thunlp/OpenDelta/tree/main/examples/tutorial/2_with_bmtrain.py) +- We add a functionality to [inspect the optimizer](https://github.com/thunlp/OpenDelta/tree/main/opendelta/utils/inspect.py). The user can see the number of trainable parameters in the optimizer and verify that opendelta is being used correctly. +- We move the functions to inspect the delta models into [inspect.py](https://github.com/thunlp/OpenDelta/tree/main/opendelta/utils/inspect.py) + +## Version 0.3.1 +- We update [must_try.py](https://github.com/thunlp/OpenDelta/tree/main/examples/unittest/must_try.py) for a simple introduction of the core functionality of OpenDelta. +- Thanks to [Weilin Zhao](https://github.com/Achazwl) We merge a long-developed branch parallel_adapter into the main branch. + + +## Version 0.3.0 +### Updates: +- Add this changelog for a granular record of updates. +- The default configuration of delta models can be applied to more wrapped models. 
+ - There is less need to configure 'modified_modules' for wrapped models like [BertForSequenceClassification](https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification) or even [OpenMatch.DRModel](https://github.com/OpenMatch/OpenMatch/blob/master/src/openmatch/modeling/dense_retrieval_model.py#L37), as long as it has a model we support default configuration inside. **Note that if you customize `modified_modules` by yourself, most pytorch models are supported.** +- LoRA and BitFit models now does not need pseudo data to instantiate the model. +- BitFit models can now support [Conv1D](https://huggingface.co/docs/transformers/v4.23.1/en/internal/modeling_utils#transformers.Conv1D) using default configuration. +- Improve type hint for AutoDeltaModel. +- Fix bugs in documentation. +- Fix small bugs when saving a model without a config attributes. +- Make the default modified modules of adapter-like methods more accurate: attach the adapter-like modules after the output of attention layer and second feed-forward layer, both before the layernorm layers. +- A simple unit test folder containing development-time tests has been added for interested users. + + +### Known Issues +- SoftPrompt is still not supported for wrapped model if the model has no attribute `get_input_embeddings`. +- Prefix Tuning is still limited to T5, GPT2, Bart, Bert, Roberta. + +## Version 0.2.4 +### Updates +- examples/examples_seq2seq and examples/examples_text-classification is depreciated and moved to [legacy](https://github.com/thunlp/OpenDelta/tree/main/examples/legacies) +- Thanks to [Zhen Zhang](https://github.com/namezhenzhang), we provide [examples_prompt](https://github.com/thunlp/OpenDelta/tree/main/examples/examples_prompt), as a cleaner and more general framework, which unifies the delta tuning paradigm and the prompt-tuning paradigm. It is still based on [Huggingface Trainers](https://huggingface.co/docs/transformers/main_classes/trainer). In this example framework, the running pipeline is [a unified script](https://github.com/thunlp/OpenDelta/tree/main/examples/examples_prompt/src), the differences in tasks, models, delta tuning models, and even prompt-tuning paradigms are [more modular and be more independent ](https://github.com/thunlp/OpenDelta/tree/main/examples/examples_prompt/backbones). Please try it out! \ No newline at end of file diff --git a/OpenDelta-0.3.2/docs/source/notes/withaccelerate.md b/OpenDelta-0.3.2/docs/source/notes/withaccelerate.md new file mode 100644 index 0000000..0934f16 --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/withaccelerate.md @@ -0,0 +1,3 @@ + +# OpenDelta + Huggingface Accelerate + \ No newline at end of file diff --git a/OpenDelta-0.3.2/docs/source/notes/withbmtrain.md b/OpenDelta-0.3.2/docs/source/notes/withbmtrain.md new file mode 100644 index 0000000..4ae1a0e --- /dev/null +++ b/OpenDelta-0.3.2/docs/source/notes/withbmtrain.md @@ -0,0 +1,12 @@ + +(acceleration)= +# OpenDelta + BMTrain + +- [BMTrain](https://github.com/OpenBMB/BMTrain) is an efficient large model training toolkit that can be used to train large models with tens of billions of parameters. It can train models in a distributed manner while keeping the code as simple as stand-alone training. +- [ModelCenter](https://github.com/OpenBMB/ModelCenter) implements pre-trained language models (PLMs) based on the backend OpenBMB/BMTrain. ModelCenter supports Efficient, Low-Resource, Extendable model usage and distributed training. 
+ +Now we have the LoraModel, AdapterModel, CompacterModel, ParallelAdapterModel, LowRankAdapterModel fully supported the distributed training with BMTrain and ModelCenter. + +Pass `backend='bmt'` in config or delta model initialization to enable `bmtrain`. + + diff --git a/OpenDelta-0.3.2/examples/README.md b/OpenDelta-0.3.2/examples/README.md new file mode 100644 index 0000000..1d4da65 --- /dev/null +++ b/OpenDelta-0.3.2/examples/README.md @@ -0,0 +1,25 @@ +# Use Examples + +This repo mainly contains several running scripts to use OpenDelta to conduct parameter-efficient training of various tasks. + +**Note that we suggest adding OpenDelta to existing scripts, instead of modify a scripts into the following examples. OpenDelta itself doens't restrict the training pipeline nor provide pipeline.** + + +## tutorial +Several toy tutorials: +1. The scripts for docs/basic_usage +2. Using interactive module selection +3. Work with [OpenPrompt](https://github.com/thunlp/OpenPrompt) + +## examples_text-classification +Modify a huggingface text-classification examples into a delta tuning one. +Currently, GLUE datasets are supported in the scripts. Roberta-base is used for performance checking. Read README.md inside the repo for detailed usage. + +## examples_seq2seq +Modify a huggingface sequence to sequence examples into a delta tuning one. +Currently, SuperGLUE and GLUE datasets are supported in the scripts. T5-base is used for performance checking. Read README.md inside the repo for detailed usage. + + +## examples_image-classification +A toy example of using OpenDelta for a Computer Vision Pretrained Model (ViT). Since ViT is an experimental feature in huggingface transformers, this example is subject to Change at any moment. + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/README.md b/OpenDelta-0.3.2/examples/examples_prompt/README.md new file mode 100644 index 0000000..d6b3329 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/README.md @@ -0,0 +1,59 @@ +# Examples of using opendelta together with 🤗 transformers. + +In this repo, we construct a very general pipeline to train and test a PLM using +🤗 transformers. + +The pipeline was constructed together with [openpromptu](https://pypi.org/project/openpromptu/), which is a light and +model-agnostic version of [openprompt](https://github.com/thunlp/OpenPrompt). + +## Pool of PLMs +We are going to adapt most of the models in 🤗 transformers +in the repos. The different pipeline, processing, or configurations are specified +in `./backbones/`. You can add your own model in this file to support customized models. + + +### A example script to run the repo in offline mode +```bash +conda activate [YOURENV] +PATHBASE=[YOURPATH] + +JOBNAME="adapter_t5-base" +DATASET="superglue-cb" + +cd $PATHBASE/OpenDelta/examples/examples_prompt/ +python configs/gen_t5.py --job $JOBNAME + +export TRANSFORMERS_OFFLINE=1 +export HF_DATASETS_OFFLINE=1 +python src/run.py configs/$JOBNAME/$DATASET.json \ +--model_name_or_path [YOURPATH_TO_T5_BASE] \ +--tokenizer_name [YOURPATH_TO_T5_BASE] \ +--datasets_saved_path [YOURPATH_TO_CB_DATASETS] \ +--finetuned_delta_path ${PATHBASE}/delta_checkpoints/ \ +--num_train_epochs 20 \ +--bottleneck_dim 24 \ +--delay_push True +``` + +## A example of quick testing the repo. 
+ +```bash +conda activate [YOURENV] +PATHBASE=[YOURPATH] + +JOBNAME="adapter_t5-base" +DATASET="superglue-cb" + +cd $PATHBASE/OpenDelta/examples/examples_prompt/ + +export TRANSFORMERS_OFFLINE=1 +export HF_DATASETS_OFFLINE=1 +export DELTACENTER_OFFLINE=0 +python src/test.py configs/$JOBNAME/$DATASET.json \ +--model_name_or_path [YOURPATH_TO_T5_BASE] \ +--tokenizer_name [YOURPATH_TO_T5_BASE] \ +--datasets_saved_path [YOURPATH_TO_CB_DATASETS] \ +--finetuned_delta_path thunlp/t5-base_adapter_superglue-cb_20220701171436c80 \ +--delta_cache_dir "./delta_checkpoints/" \ +--force_download True +``` \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/__init__.py b/OpenDelta-0.3.2/examples/examples_prompt/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/OpenDelta-0.3.2/examples/examples_prompt/backbones/bart.py b/OpenDelta-0.3.2/examples/examples_prompt/backbones/bart.py new file mode 100644 index 0000000..6b9dd92 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/backbones/bart.py @@ -0,0 +1,179 @@ + +from openpromptu.data_utils import InputExample +from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer +from transformers import ( + AutoConfig, + AutoModelForSeq2SeqLM, + AutoTokenizer, +) +from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator +import torch + +def mask_token_func(tokenizer, ith_mask=0): + return tokenizer.mask_token + +def get_remove_columns(dataset_features): + return dataset_features + +def preprocess_function(raw_example, **kwargs): + # max_target_length += 1 + tokenizer = kwargs['tokenizer'] + data_args = kwargs['data_args'] + template = kwargs['template'] + verbalizer = kwargs['verbalizer'] + tokenizer_wrapper = kwargs['tokenizer_wrapper'] + split = kwargs['split'] + example = InputExample(**raw_example) + + + + example = verbalizer.wrap_one_example(example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=256, + padding="max_length", truncation=True) + + + + with tokenizer.as_target_tokenizer(): + label = tokenizer(other['tgt_text']).input_ids + + model_inputs["labels"] = label + return model_inputs + +def get_backbone(model_args, **kwargs): + config = AutoConfig.from_pretrained( + # model_args.config_name if model_args.config_name else model_args.model_name_or_path, + model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + config.dropout_rate = 0.0 + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + + model = AutoModelForSeq2SeqLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + return config, tokenizer, model + + +def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): + from openpromptu.prompts import GenerationVerbalizer + from openpromptu.prompts import ManualTemplate + from openpromptu import TokenizerWrapper + template = 
ManualTemplate(text = task.templates_text[template_id]) + verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id]) + tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func) + return template, verbalizer, tokenizer_wrapper + +class Trainer(HfSeq2SeqTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.eval_task = eval_task + self.compute_metrics = self._compute_metrics + + def compute_loss(self, model, inputs, return_outputs=False): + outputs = model(**inputs) + if return_outputs: + return (outputs.loss, outputs) + else: + return outputs.loss + + def prediction_step( + self, + model, #nn.Module, + inputs, #Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only, #: bool, + ignore_keys, #: Optional[List[str]] = None, + ): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). + """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + gen_kwargs = { + "max_length": 10, # self._max_length if s is not None else self.model.config.max_length, + "num_beams": 1 #self._num_beams if self._num_beams is not None else self.model.config.num_beams, + } + generated_tokens = self.model.generate( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + **gen_kwargs, + ) + # in case the batch is shorter than max length, the output should be padded + if generated_tokens.shape[-1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + + with torch.no_grad(): + + outputs = model(**inputs) + if has_labels: + if self.label_smoother is not None: + loss = self.label_smoother(outputs, inputs["labels"]).mean().detach() + else: + loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach() + else: + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) + + labels = inputs["labels"] + if labels.shape[-1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + + # from IPython import embed; embed(header="In seqseqtrainer") + return (loss, generated_tokens, labels) + + def _compute_metrics(self, eval_preds): + # from IPython import embed; embed(header="In compute metrics") + preds, labels = eval_preds + decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) + decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) + # post_processor = 
.get(data_args.dataset_name[0], tokenizer, + # data_args.ignore_pad_token_for_loss) + # decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info) + result = {} + for metric in self.eval_task.metric: + result.update(metric(decoded_preds, decoded_labels)) + + average_metric = sum(result.values())/len(result) + result.update({"average_metrics":average_metric}) + return result + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/backbones/beit.py b/OpenDelta-0.3.2/examples/examples_prompt/backbones/beit.py new file mode 100644 index 0000000..c35bd4e --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/backbones/beit.py @@ -0,0 +1,140 @@ +from openpromptu.data_utils import InputExample +import torch +from transformers.data.data_collator import torch_default_data_collator +from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin +import numpy as np +from transformers import ( + AutoConfig, + AutoFeatureExtractor, + AutoModelForImageClassification, +) + +from transformers import Trainer as HfTrainer +import torch.nn as nn + + +def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): + # from openpromptu.prompts import ManualVerbalizer + # from openpromptu.prompts import ManualTemplate + # from openpromptu import TokenizerWrapper + # template = ManualTemplate(text = task.templates_text[template_id]) + # verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id]) + # tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func) + return None, None, None + +def preprocess_function(raw_example, **kwargs): + # from IPython import embed; embed(header="Therefa") + tokenizer = kwargs['tokenizer'] + # print(np.array(raw_example['img']).shape) + model_inputs = tokenizer(np.array(raw_example['image']), return_tensors='pt') + model_inputs['pixel_values'] = model_inputs['pixel_values'].squeeze() + model_inputs['labels'] = raw_example['label'] + return model_inputs + +def compute_metrics(eval_preds, dataset_name, eval_metric): + # from IPython import embed; embed(header="In compute metrics") + + preds, labels = eval_preds.predictions, eval_preds.label_ids + + preds = np.argmax(preds, axis=-1) + + result = {} + average_metrics = [] + for metric in eval_metric: + metric_item = metric(preds, labels) + metric_value = list(metric_item.values()) + result.update(metric_item) + average_metrics.extend(metric_value) + print("average:",average_metrics) + average_metric = sum(average_metrics)/len(average_metrics) + result.update({"average_metrics":average_metric}) + return result + +def mask_token_func(tokenizer, ith_mask=0): + return tokenizer.mask_token + +def get_remove_columns(dataset_features): + # dataset_features.pop("label") + # print("remove_columns: {}".format(dataset_features)) + return dataset_features + +class DataCollator(HfDataCollatorMixin): + def __init__(self, *args, **kwargs): + self.return_tensors='pt' + + def torch_call(self, features): + # from IPython import embed; embed(header="in data collator") + a = torch_default_data_collator(features=features) + # from IPython import embed; embed(header="in data collator") + return a + + +def get_backbone(model_args, **kwargs): + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + 
revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + config.dropout_rate = 0.0 + tokenizer = AutoFeatureExtractor.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + model = AutoModelForImageClassification.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + config.num_labels = model_args.num_classes + old_classifier = model.classifier + model.classifier = nn.Linear(old_classifier.in_features, config.num_labels) + + + return config, tokenizer, model + +class Trainer(HfTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.verbalizer=verbalizer + self.eval_task=eval_task + self.compute_metrics = self._compute_metrics + self.loss_fn = nn.CrossEntropyLoss() + + def compute_loss(self, model, inputs, return_outputs=False): + labels = inputs.pop('labels') + outputs = model(**inputs) + logits = outputs.get("logits") + + loss = self.loss_fn(logits, labels) + return (loss, outputs) if return_outputs else loss + + def _compute_metrics(self, eval_preds): + # from IPython import embed; embed(header="In compute metrics") + + preds, labels = eval_preds.predictions, eval_preds.label_ids + + preds = np.argmax(preds, axis=-1) + + result = {} + average_metrics = [] + for metric in self.eval_task.metric: + metric_item = metric(preds, labels) + metric_value = list(metric_item.values()) + result.update(metric_item) + average_metrics.extend(metric_value) + print("average:",average_metrics) + average_metric = sum(average_metrics)/len(average_metrics) + result.update({"average_metrics":average_metric}) + from IPython import embed; embed(header="In compute metrics") + return result + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/backbones/bert.py b/OpenDelta-0.3.2/examples/examples_prompt/backbones/bert.py new file mode 100644 index 0000000..af92002 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/backbones/bert.py @@ -0,0 +1,142 @@ +from openpromptu.data_utils import InputExample +import torch +from transformers.data.data_collator import torch_default_data_collator +from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin +import numpy as np +from transformers import ( + AutoConfig, + AutoModelForMaskedLM, + AutoTokenizer, +) + +from transformers import Trainer as HfTrainer + + +def preprocess_function(raw_example, **kwargs): + tokenizer = kwargs['tokenizer'] + data_args = kwargs['data_args'] + template = kwargs['template'] + verbalizer = kwargs['verbalizer'] + tokenizer_wrapper = kwargs['tokenizer_wrapper'] + + example = InputExample(**raw_example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length, + padding="max_length", truncation=True) + return model_inputs + +def compute_metrics(eval_preds, dataset_name, eval_metric): + # from IPython import embed; embed(header="In compute metrics") + + preds, labels = eval_preds.predictions, eval_preds.label_ids + + preds 
= np.argmax(preds, axis=-1) + + result = {} + average_metrics = [] + for metric in eval_metric: + metric_item = metric(preds, labels) + metric_value = list(metric_item.values()) + result.update(metric_item) + average_metrics.extend(metric_value) + print("average:",average_metrics) + average_metric = sum(average_metrics)/len(average_metrics) + result.update({"average_metrics":average_metric}) + return result + +def mask_token_func(tokenizer, ith_mask=0): + return tokenizer.mask_token + +def get_remove_columns(dataset_features): + dataset_features.remove("label") + return dataset_features + + +def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): + from openpromptu.prompts import ManualVerbalizer + from openpromptu.prompts import ManualTemplate + from openpromptu import TokenizerWrapper + template = ManualTemplate(text = task.templates_text[template_id]) + verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id]) + tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func) + # from IPython import embed; embed() + return template, verbalizer, tokenizer_wrapper + +class DataCollator(HfDataCollatorMixin): + def __init__(self, *args, **kwargs): + self.return_tensors='pt' + + def torch_call(self, features): + return torch_default_data_collator(features=features) + + + +def get_backbone(model_args, **kwargs): + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + config.dropout_rate = 0.0 + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + model = AutoModelForMaskedLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + model.resize_token_embeddings(len(tokenizer)) + return config, tokenizer, model + +class Trainer(HfTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.verbalizer=verbalizer + self.eval_task=eval_task + self.compute_metrics = self._compute_metrics + + + def compute_loss(self, model, inputs, return_outputs=False): + labels = inputs.pop('labels') + outputs = model(**inputs) + logits = outputs.get("logits") + input_ids = inputs['input_ids'] + verbalizer = self.verbalizer.cuda() + logits_at_mask = logits[torch.where(input_ids == verbalizer.tokenizer.mask_token_id)] + label_logits = verbalizer.process_logits(logits_at_mask) + loss_fct = torch.nn.CrossEntropyLoss() + loss = loss_fct(label_logits, labels) + outputs.logits = label_logits + return (loss, outputs) if return_outputs else loss + + def _compute_metrics(self, eval_preds): + # from IPython import embed; embed(header="In compute metrics") + + preds, labels = eval_preds.predictions, eval_preds.label_ids + + preds = np.argmax(preds, axis=-1) + + result = {} + average_metrics = [] + for metric in 
self.eval_task.metric: + metric_item = metric(preds, labels) + metric_value = list(metric_item.values()) + result.update(metric_item) + average_metrics.extend(metric_value) + print("average:",average_metrics) + average_metric = sum(average_metrics)/len(average_metrics) + result.update({"average_metrics":average_metric}) + return result + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/backbones/bigbird.py b/OpenDelta-0.3.2/examples/examples_prompt/backbones/bigbird.py new file mode 100644 index 0000000..b1dabcb --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/backbones/bigbird.py @@ -0,0 +1,143 @@ +from openpromptu.data_utils import InputExample +import torch +from transformers.data.data_collator import torch_default_data_collator +from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin +import numpy as np +from transformers import ( + AutoConfig, + AutoModelForMaskedLM, + AutoTokenizer, +) + +from transformers import Trainer as HfTrainer + + +def preprocess_function(raw_example, **kwargs): + tokenizer = kwargs['tokenizer'] + data_args = kwargs['data_args'] + template = kwargs['template'] + verbalizer = kwargs['verbalizer'] + tokenizer_wrapper = kwargs['tokenizer_wrapper'] + + example = InputExample(**raw_example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length, + padding="max_length", truncation=True) + return model_inputs + +def compute_metrics(eval_preds, dataset_name, eval_metric): + # from IPython import embed; embed(header="In compute metrics") + + preds, labels = eval_preds.predictions, eval_preds.label_ids + + preds = np.argmax(preds, axis=-1) + + result = {} + average_metrics = [] + for metric in eval_metric: + metric_item = metric(preds, labels) + metric_value = list(metric_item.values()) + result.update(metric_item) + average_metrics.extend(metric_value) + print("average:",average_metrics) + average_metric = sum(average_metrics)/len(average_metrics) + result.update({"average_metrics":average_metric}) + return result + +def mask_token_func(tokenizer, ith_mask=0): + return tokenizer.mask_token + +def get_remove_columns(dataset_features): + # from IPython import embed; embed(header="get_remove_columns") + dataset_features.remove("label") + return dataset_features + + +def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): + from openpromptu.prompts import ManualVerbalizer + from openpromptu.prompts import ManualTemplate + from openpromptu import TokenizerWrapper + template = ManualTemplate(text = task.templates_text[template_id]) + verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id]) + tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func) + # from IPython import embed; embed() + return template, verbalizer, tokenizer_wrapper + +class DataCollator(HfDataCollatorMixin): + def __init__(self, *args, **kwargs): + self.return_tensors='pt' + + def torch_call(self, features): + return torch_default_data_collator(features=features) + + + +def get_backbone(model_args, **kwargs): + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + 
use_auth_token=True if model_args.use_auth_token else None, + ) + config.dropout_rate = 0.0 + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + model = AutoModelForMaskedLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + model.resize_token_embeddings(len(tokenizer)) + return config, tokenizer, model + +class Trainer(HfTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.verbalizer=verbalizer + self.eval_task=eval_task + self.compute_metrics = self._compute_metrics + + + def compute_loss(self, model, inputs, return_outputs=False): + labels = inputs.pop('labels') + outputs = model(**inputs) + logits = outputs.get("logits") + input_ids = inputs['input_ids'] + verbalizer = self.verbalizer.cuda() + logits_at_mask = logits[torch.where(input_ids == verbalizer.tokenizer.mask_token_id)] + label_logits = verbalizer.process_logits(logits_at_mask) + loss_fct = torch.nn.CrossEntropyLoss() + loss = loss_fct(label_logits, labels) + outputs.logits = label_logits + return (loss, outputs) if return_outputs else loss + + def _compute_metrics(self, eval_preds): + # from IPython import embed; embed(header="In compute metrics") + + preds, labels = eval_preds.predictions, eval_preds.label_ids + + preds = np.argmax(preds, axis=-1) + + result = {} + average_metrics = [] + for metric in self.eval_task.metric: + metric_item = metric(preds, labels) + metric_value = list(metric_item.values()) + result.update(metric_item) + average_metrics.extend(metric_value) + print("average:",average_metrics) + average_metric = sum(average_metrics)/len(average_metrics) + result.update({"average_metrics":average_metric}) + return result + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/backbones/bigbird_.py b/OpenDelta-0.3.2/examples/examples_prompt/backbones/bigbird_.py new file mode 100644 index 0000000..8945103 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/backbones/bigbird_.py @@ -0,0 +1,169 @@ +from openpromptu.data_utils import InputExample +import torch +from transformers.data.data_collator import torch_default_data_collator +from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin +from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator +import numpy as np +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, +) + +from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer +import copy +from torch.nn import CrossEntropyLoss + +def preprocess_function(raw_example, **kwargs): + tokenizer = kwargs['tokenizer'] + data_args = kwargs['data_args'] + template = kwargs['template'] + verbalizer = kwargs['verbalizer'] + tokenizer_wrapper = kwargs['tokenizer_wrapper'] + + example = InputExample(**raw_example) + # example = verbalizer.wrap_one_example(example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length, + padding="max_length", 
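+                             # pad/truncate every example to a fixed max_source_length so features can be batched without dynamic padding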
truncation=True) + return model_inputs + + + +def compute_metrics(eval_preds, dataset_name, eval_metric): + pass + +def mask_token_func(tokenizer, ith_mask=0): + return tokenizer.pad_token + +def get_remove_columns(dataset_features): + # dataset_features.remove("label") + return dataset_features + +def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): + from openpromptu.prompts import GenerationVerbalizer + from openpromptu.prompts import ManualTemplate + from openpromptu import TokenizerWrapper + template = ManualTemplate(text = task.templates_text[template_id]) + verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = None, label_words=None) + tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func) + return template, verbalizer, tokenizer_wrapper + + +def get_backbone(model_args, **kwargs): + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + # config.dropout_rate = 0.0 + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + return config, tokenizer, model + +class Trainer(HfSeq2SeqTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.eval_task = eval_task + self.compute_metrics = self._compute_metrics + + def compute_loss(self, model, inputs, return_outputs=False): + + labels=copy.deepcopy(inputs['input_ids']) + # labels[labels==self.tokenizer.pad_token_id]=-100 + outputs = model(**inputs) + logits = outputs.logits + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.long().view(-1)) + + return (loss, outputs) if return_outputs else loss + + def prediction_step( + self, + model, #nn.Module, + inputs, #Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only, #: bool, + ignore_keys, #: Optional[List[str]] = None, + ): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. 
+ + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). + """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + inputs = self._prepare_inputs(inputs) + with torch.no_grad(): + labels=copy.deepcopy(inputs['input_ids']) + # labels[labels==self.tokenizer.pad_token_id]=-100 + outputs = model(**inputs) + logits = outputs.logits + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous().long() + loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1)).detach().cpu() + loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss) + + if prediction_loss_only: + return (loss, None, None) + else: + # non pad label + shift_labels = shift_labels.view(-1).detach().cpu() + nonpad_idx = shift_labels!=self.tokenizer.pad_token_id + shift_labels = shift_labels[nonpad_idx] + # the probability at the corresponding position + shift_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu() + target_position = torch.nn.functional.one_hot(shift_labels,shift_logits.shape[-1]).bool().to(shift_labels.device) + shift_logits = shift_logits.softmax(dim=-1)[target_position] + + + return (loss, shift_logits, shift_labels) + + def _compute_metrics(self, eval_preds): + + preds, labels = eval_preds + + result = {} + for metric in self.eval_task.metric: + result.update(metric(preds, labels,ignore_index=self.tokenizer.pad_token_id)) + + average_metric = sum(result.values())/len(result) + result.update({"average_metrics":average_metric}) + return result \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/backbones/blenderbot.py b/OpenDelta-0.3.2/examples/examples_prompt/backbones/blenderbot.py new file mode 100644 index 0000000..54e4ec8 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/backbones/blenderbot.py @@ -0,0 +1,181 @@ + +from openpromptu.data_utils import InputExample +from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer +from transformers import ( + AutoConfig, + BlenderbotForConditionalGeneration, + AutoTokenizer, +) +from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator +import torch + +def mask_token_func(tokenizer, ith_mask=0): + return "" + +def get_remove_columns(dataset_features): + return dataset_features + +def preprocess_function(raw_example, **kwargs): + # max_target_length += 1 + tokenizer = kwargs['tokenizer'] + data_args = kwargs['data_args'] + template = kwargs['template'] + verbalizer = kwargs['verbalizer'] + tokenizer_wrapper = kwargs['tokenizer_wrapper'] + split = kwargs['split'] + example = InputExample(**raw_example) + + + + example = verbalizer.wrap_one_example(example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length, + padding="max_length", truncation=True) + + + with tokenizer.as_target_tokenizer(): + label = tokenizer(other['tgt_text']).input_ids + + model_inputs["labels"] = label + # from IPython import embed; embed() + return model_inputs + +def get_backbone(model_args, **kwargs): + config = AutoConfig.from_pretrained( + model_args.config_name if 
model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + config.dropout_rate = 0.0 + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + + model = BlenderbotForConditionalGeneration.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + # from IPython import embed; embed() + return config, tokenizer, model + + +def get_prompts(task, tokenizer, data_args, template_id="blenderbot", verbalizer_id="blenderbot"): + from openpromptu.prompts import GenerationVerbalizer + from openpromptu.prompts import ManualTemplate + from openpromptu import TokenizerWrapper + template = ManualTemplate(text = task.templates_text[template_id]) + verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id]) + tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func) + return template, verbalizer, tokenizer_wrapper + +class Trainer(HfSeq2SeqTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.eval_task = eval_task + self.compute_metrics = self._compute_metrics + + def compute_loss(self, model, inputs, return_outputs=False): + # from IPython import embed; embed() + outputs = model(**inputs) + if return_outputs: + return (outputs.loss, outputs) + else: + return outputs.loss + + def prediction_step( + self, + model, #nn.Module, + inputs, #Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only, #: bool, + ignore_keys, #: Optional[List[str]] = None, + ): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). + """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + gen_kwargs = { + "max_length": 10, # self._max_length if s is not None else self.model.config.max_length, + "num_beams": 1, #self._num_beams if self._num_beams is not None else self.model.config.num_beams, + "min_length": 1 # for blenderbot, generally we set it to be a large number. 
But in classification, we set it to 1 + } + generated_tokens = self.model.generate( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + **gen_kwargs, + ) + # in case the batch is shorter than max length, the output should be padded + if generated_tokens.shape[-1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + + with torch.no_grad(): + + outputs = model(**inputs) + if has_labels: + if self.label_smoother is not None: + loss = self.label_smoother(outputs, inputs["labels"]).mean().detach() + else: + loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach() + else: + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) + + labels = inputs["labels"] + if labels.shape[-1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + + # from IPython import embed; embed(header="In seqseqtrainer") + return (loss, generated_tokens, labels) + + def _compute_metrics(self, eval_preds): + # from IPython import embed; embed(header="In compute metrics") + preds, labels = eval_preds + decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) + decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) + # post_processor = .get(data_args.dataset_name[0], tokenizer, + # data_args.ignore_pad_token_for_loss) + # decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info) + result = {} + for metric in self.eval_task.metric: + result.update(metric(decoded_preds, decoded_labels)) + + average_metric = sum(result.values())/len(result) + result.update({"average_metrics":average_metric}) + return result + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/backbones/clip.py b/OpenDelta-0.3.2/examples/examples_prompt/backbones/clip.py new file mode 100644 index 0000000..4889b97 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/backbones/clip.py @@ -0,0 +1,172 @@ +from openpromptu.data_utils import InputExample +import torch +from transformers.data.data_collator import torch_default_data_collator +from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin +import numpy as np +from transformers import ( + CLIPConfig, + CLIPProcessor, + CLIPModel, +) +from transformers import ViTFeatureExtractor +from PIL import Image +from transformers import Trainer as HfTrainer +import torch.nn as nn + + + +def get_prompts(task, tokenizer, data_args, template_id="clip", verbalizer_id="clip"): + from openpromptu.prompts import GenerationVerbalizer + from openpromptu.prompts import ManualTemplate + from openpromptu import TokenizerWrapper + template = ManualTemplate(text = task.templates_text[template_id]) + verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id]) + tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer.tokenizer, truncate_method="balanced", mask_token_func=mask_token_func) + return template, verbalizer, tokenizer_wrapper + +def mask_token_func(tokenizer, ith_mask=0): + return tokenizer.mask_token + +def preprocess_function(raw_example, **kwargs): + # from IPython import embed; embed(header="Therefa") + tokenizer = kwargs['tokenizer'] + + # ["a photo of {}" for i in range()] + data_args = kwargs['data_args'] + template = kwargs['template'] + verbalizer = kwargs['verbalizer'] + tokenizer_wrapper = 
kwargs['tokenizer_wrapper'] + + example = InputExample(raw_example) + + texts = [] + + for candidate_label in range(verbalizer.num_classes): + tgt_text = verbalizer.wrap_one_example(label=candidate_label) + wrapped_example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(wrapped_example, tgt_texts=[tgt_text]) + texts.append(input_sentence) + + # from IPython import embed; embed()/ + + image = Image.open(raw_example['image_file_path']) + + model_inputs = tokenizer(images=image, text=texts, max_length=16, padding="max_length", truncation=True, return_tensors='pt') + + # from IPython import embed; embed() + model_inputs["pixel_values"] = model_inputs["pixel_values"].squeeze() + model_inputs["label"] = example.label + return model_inputs + +def compute_metrics(eval_preds, dataset_name, eval_metric): + # from IPython import embed; embed(header="In compute metrics") + + preds, labels = eval_preds.predictions, eval_preds.label_ids + + preds = np.argmax(preds, axis=-1) + + result = {} + average_metrics = [] + for metric in eval_metric: + metric_item = metric(preds, labels) + metric_value = list(metric_item.values()) + result.update(metric_item) + average_metrics.extend(metric_value) + print("average:",average_metrics) + average_metric = sum(average_metrics)/len(average_metrics) + result.update({"average_metrics":average_metric}) + return result + + + +def get_remove_columns(dataset_features): + # from IPython import embed; embed(header="in remoev") + dataset_features.remove("labels") + print("remove_columns: {}".format(dataset_features)) + return dataset_features + +class DataCollator(HfDataCollatorMixin): + def __init__(self, *args, **kwargs): + self.return_tensors='pt' + + def torch_call(self, features): + # from IPython import embed; embed(header="in data collator") + a = torch_default_data_collator(features=features) + # from IPython import embed; embed(header="in data collator") + a["input_ids"] = a["input_ids"][0] + a["attention_mask"] = a["attention_mask"][0] + return a + + +def get_backbone(model_args, **kwargs): + config = CLIPConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + # config.dropout_rate = 0.0 + tokenizer = CLIPProcessor.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + model = CLIPModel.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + # config.num_labels = model_args.num_classes + # old_classifier = model.classifier + # model.classifier = nn.Linear(old_classifier.in_features, config.num_labels) + + + return config, tokenizer, model + +class Trainer(HfTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.verbalizer=verbalizer + self.eval_task=eval_task + self.compute_metrics = self._compute_metrics + self.loss_fn = nn.CrossEntropyLoss() + + def compute_loss(self, model, inputs, return_outputs=False): + # from IPython 
import embed; embed()
+        labels = inputs.pop('labels')
+        outputs = model(**inputs)
+        # logits = outputs.get("logits")
+
+
+        logits_per_image = outputs.logits_per_image
+        loss = self.loss_fn(logits_per_image, labels)
+        return (loss, outputs) if return_outputs else loss
+
+    def _compute_metrics(self, eval_preds):
+        # from IPython import embed; embed(header="In compute metrics")
+
+        preds, labels = eval_preds.predictions, eval_preds.label_ids
+
+        preds = np.argmax(preds, axis=-1)
+
+        result = {}
+        average_metrics = []
+        for metric in self.eval_task.metric:
+            metric_item = metric(preds, labels)
+            metric_value = list(metric_item.values())
+            result.update(metric_item)
+            average_metrics.extend(metric_value)
+        print("average:",average_metrics)
+        average_metric = sum(average_metrics)/len(average_metrics)
+        result.update({"average_metrics":average_metric})
+        return result
+
+
diff --git a/OpenDelta-0.3.2/examples/examples_prompt/backbones/opt.py b/OpenDelta-0.3.2/examples/examples_prompt/backbones/opt.py
new file mode 100644
index 0000000..5902bc9
--- /dev/null
+++ b/OpenDelta-0.3.2/examples/examples_prompt/backbones/opt.py
@@ -0,0 +1,171 @@
+from openpromptu.data_utils import InputExample
+import torch
+from transformers.data.data_collator import torch_default_data_collator
+from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
+from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
+import numpy as np
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+)
+
+from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
+import copy
+from torch.nn import CrossEntropyLoss
+
+def preprocess_function(raw_example, **kwargs):
+    tokenizer = kwargs['tokenizer']
+    data_args = kwargs['data_args']
+    template = kwargs['template']
+    verbalizer = kwargs['verbalizer']
+    tokenizer_wrapper = kwargs['tokenizer_wrapper']
+
+    example = InputExample(**raw_example)
+    # example = verbalizer.wrap_one_example(example)
+    example, other = template.wrap_one_example(example)
+    input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
+    model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
+                             padding="max_length", truncation=True)
+    return model_inputs
+
+
+
+def compute_metrics(eval_preds, dataset_name, eval_metric):
+    pass
+
+def mask_token_func(tokenizer, ith_mask=0):
+    return tokenizer.pad_token
+
+def get_remove_columns(dataset_features):
+    # dataset_features.remove("label")
+    return dataset_features
+
+def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
+    from openpromptu.prompts import GenerationVerbalizer
+    from openpromptu.prompts import ManualTemplate
+    from openpromptu import TokenizerWrapper
+    template = ManualTemplate(text = task.templates_text[template_id])
+    verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = None, label_words=None)
+    tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="tail", mask_token_func=mask_token_func)
+    return template, verbalizer, tokenizer_wrapper
+
+
+def get_backbone(model_args, **kwargs):
+    config = AutoConfig.from_pretrained(
+        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        revision=model_args.model_revision,
+        use_auth_token=True if model_args.use_auth_token else None,
+    )
+    # config.dropout_rate = 0.0
+    tokenizer = 
AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + if not hasattr(tokenizer,"pad_token") or (hasattr(tokenizer,"pad_token") and tokenizer.pad_token==None): + tokenizer.pad_token = tokenizer.eos_token + + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + return config, tokenizer, model + +class Trainer(HfSeq2SeqTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.eval_task = eval_task + self.compute_metrics = self._compute_metrics + + def compute_loss(self, model, inputs, return_outputs=False): + + labels=copy.deepcopy(inputs['input_ids']) + # labels[labels==self.tokenizer.pad_token_id]=-100 + outputs = model(**inputs) + logits = outputs.logits + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.long().view(-1)) + + return (loss, outputs) if return_outputs else loss + + def prediction_step( + self, + model, #nn.Module, + inputs, #Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only, #: bool, + ignore_keys, #: Optional[List[str]] = None, + ): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). 
+ """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + inputs = self._prepare_inputs(inputs) + with torch.no_grad(): + labels=copy.deepcopy(inputs['input_ids']) + # labels[labels==self.tokenizer.pad_token_id]=-100 + outputs = model(**inputs) + logits = outputs.logits + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous().long() + loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1)).detach().cpu() + loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss) + + if prediction_loss_only: + return (loss, None, None) + else: + # non pad label + shift_labels = shift_labels.view(-1).detach().cpu() + nonpad_idx = shift_labels!=self.tokenizer.pad_token_id + shift_labels = shift_labels[nonpad_idx] + # the probability at the corresponding position + shift_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu() + target_position = torch.nn.functional.one_hot(shift_labels,shift_logits.shape[-1]).bool().to(shift_labels.device) + shift_logits = shift_logits.softmax(dim=-1)[target_position] + + + return (loss, shift_logits, shift_labels) + + def _compute_metrics(self, eval_preds): + + preds, labels = eval_preds + + result = {} + for metric in self.eval_task.metric: + result.update(metric(preds, labels,ignore_index=self.tokenizer.pad_token_id)) + + average_metric = sum(result.values())/len(result) + result.update({"average_metrics":average_metric}) + return result \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/backbones/t5.py b/OpenDelta-0.3.2/examples/examples_prompt/backbones/t5.py new file mode 100644 index 0000000..15e7f21 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/backbones/t5.py @@ -0,0 +1,177 @@ + +from openpromptu.data_utils import InputExample +from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer +from transformers import ( + AutoConfig, + AutoModelForSeq2SeqLM, + AutoTokenizer, +) +from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator +import torch + +def mask_token_func(tokenizer, ith_mask): + return tokenizer.additional_special_tokens[ith_mask] + +def get_remove_columns(dataset_features): + return dataset_features + +def preprocess_function(raw_example, **kwargs): + # max_target_length += 1 + tokenizer = kwargs['tokenizer'] + data_args = kwargs['data_args'] + template = kwargs['template'] + verbalizer = kwargs['verbalizer'] + tokenizer_wrapper = kwargs['tokenizer_wrapper'] + split = kwargs['split'] + example = InputExample(**raw_example) + + + + example = verbalizer.wrap_one_example(example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=256, + padding="max_length", truncation=True) + + + with tokenizer.as_target_tokenizer(): + label = tokenizer(other['tgt_text']).input_ids + + model_inputs["labels"] = label + return model_inputs + +def get_backbone(model_args, **kwargs): + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + config.dropout_rate = 0.0 + 
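+    # dropout_rate = 0.0 switches off dropout throughout the T5 backbone (a common choice when only small delta modules are being tuned)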
tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + + model = AutoModelForSeq2SeqLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + return config, tokenizer, model + + +def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): + from openpromptu.prompts import GenerationVerbalizer + from openpromptu.prompts import ManualTemplate + from openpromptu import TokenizerWrapper + template = ManualTemplate(text = task.templates_text[template_id]) + verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id]) + tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func) + return template, verbalizer, tokenizer_wrapper + +class Trainer(HfSeq2SeqTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.eval_task = eval_task + self.compute_metrics = self._compute_metrics + + def compute_loss(self, model, inputs, return_outputs=False): + outputs = model(**inputs) + if return_outputs: + return (outputs.loss, outputs) + else: + return outputs.loss + + def prediction_step( + self, + model, #nn.Module, + inputs, #Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only, #: bool, + ignore_keys, #: Optional[List[str]] = None, + ): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). 
+ """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + gen_kwargs = { + "max_length": 10, # self._max_length if s is not None else self.model.config.max_length, + "num_beams": 1 #self._num_beams if self._num_beams is not None else self.model.config.num_beams, + } + generated_tokens = self.model.generate( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + **gen_kwargs, + ) + # in case the batch is shorter than max length, the output should be padded + if generated_tokens.shape[-1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + + with torch.no_grad(): + + outputs = model(**inputs) + if has_labels: + if self.label_smoother is not None: + loss = self.label_smoother(outputs, inputs["labels"]).mean().detach() + else: + loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach() + else: + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) + + labels = inputs["labels"] + if labels.shape[-1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + + # from IPython import embed; embed(header="In seqseqtrainer") + return (loss, generated_tokens, labels) + + def _compute_metrics(self, eval_preds): + # from IPython import embed; embed(header="In compute metrics") + preds, labels = eval_preds + decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) + decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) + # post_processor = .get(data_args.dataset_name[0], tokenizer, + # data_args.ignore_pad_token_for_loss) + # decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info) + result = {} + for metric in self.eval_task.metric: + result.update(metric(decoded_preds, decoded_labels)) + + average_metric = sum(result.values())/len(result) + result.update({"average_metrics":average_metric}) + return result + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/backbones/vit.py b/OpenDelta-0.3.2/examples/examples_prompt/backbones/vit.py new file mode 100644 index 0000000..e69de29 diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/adapter_clip-vit-base-patch32/beans.json b/OpenDelta-0.3.2/examples/examples_prompt/configs/adapter_clip-vit-base-patch32/beans.json new file mode 100644 index 0000000..5f46495 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/adapter_clip-vit-base-patch32/beans.json @@ -0,0 +1,48 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "beans", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32", + "num_classes": 3, + "num_train_epochs": 20, + "output_dir": "outputs/adapter/clip-vit-base-patch32/beans", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_delta_center": true, + "push_to_hub": 
false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "beans", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "beans", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/adapter_opt-350m/wikitext.json b/OpenDelta-0.3.2/examples/examples_prompt/configs/adapter_opt-350m/wikitext.json new file mode 100644 index 0000000..af141ff --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/adapter_opt-350m/wikitext.json @@ -0,0 +1,53 @@ +{ + "backbone_model": "opt", + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "wikitext", + "eval_steps": 200, + "evaluation_strategy": "steps", + "gradient_accumulation_steps":2, + "greater_is_better": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 900, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m", + "model_path_public": "opt-350m", + "num_train_epochs": 3, + "output_dir": "outputs/adapter/opt-350m/wikitext", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 6, + "per_device_train_batch_size": 6, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "wikitext", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "wikitext", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["self_attn"] +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/adapter_vit-large-patch16-224-in21k/beans.json b/OpenDelta-0.3.2/examples/examples_prompt/configs/adapter_vit-large-patch16-224-in21k/beans.json new file mode 100644 index 0000000..ff7551a --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/adapter_vit-large-patch16-224-in21k/beans.json @@ -0,0 +1,53 @@ +{ + "backbone_model": "vit", + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": false, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "beans", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k", + "model_path_public": "vit-large-patch16-224-in21k", + "num_classes": 3, + "num_train_epochs": 20, + "output_dir": "outputs/adapter/vit-large-patch16-224-in21k/beans", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + 
"predict_with_generate": false, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "beans", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "beans", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["output"] +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/bitfit_t5-large/rte.json b/OpenDelta-0.3.2/examples/examples_prompt/configs/bitfit_t5-large/rte.json new file mode 100644 index 0000000..04e7f77 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/bitfit_t5-large/rte.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "t5-large", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/t5-large", + "model_path_public": "t5-large", + "num_train_epochs": 20, + "output_dir": "outputs/bitfit/t5-large/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/t5-large", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attn", "ff", "layer_norm"] +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/compacter_blenderbot-3b/sst2.json b/OpenDelta-0.3.2/examples/examples_prompt/configs/compacter_blenderbot-3b/sst2.json new file mode 100644 index 0000000..2862f6e --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/compacter_blenderbot-3b/sst2.json @@ -0,0 +1,66 @@ +{ + "backbone_model": "blenderbot", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "compacter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "sst2", + "eval_steps": 200, + "evaluation_strategy": "steps", + "factorized_phm": true, + "factorized_phm_rule": false, + "gradient_clip": false, + "greater_is_better": true, + "hypercomplex_adapters": true, + "hypercomplex_division": 4, + "hypercomplex_nonlinearity": "glorot-uniform", + "learn_phm": true, + "learning_rate": 0.003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b", + "model_path_public": "blenderbot-3b", + "non_linearity": "gelu_new", + "normalize_phm_weight": false, + 
"num_train_epochs": 3, + "output_dir": "outputs/compacter/blenderbot-3b/sst2", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "phm_c_init": "normal", + "phm_clamp": false, + "phm_init_range": 0.0001, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "shared_phm_rule": false, + "split_validation_test": true, + "task_name": "sst2", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "sst2", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "use_bias_down_sampler": true, + "use_bias_up_sampler": true, + "warmup_steps": 0, + "modified_modules":["fc2"] +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/compacter_deberta-v2-xlarge/mnli.json b/OpenDelta-0.3.2/examples/examples_prompt/configs/compacter_deberta-v2-xlarge/mnli.json new file mode 100644 index 0000000..23c38d7 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/compacter_deberta-v2-xlarge/mnli.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "deberta-v2-xlarge", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "compacter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mnli", + "eval_steps": 500, + "evaluation_strategy": "steps", + "greater_is_better": true, + "is_seq2seq": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge", + "num_train_epochs": 3, + "output_dir": "outputs/compacter/deberta-v2-xlarge/mnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hub": false, + "save_steps": 500, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "mnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mnli", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attention"] +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/compacter_long-t5-tglobal-large/rte.json b/OpenDelta-0.3.2/examples/examples_prompt/configs/compacter_long-t5-tglobal-large/rte.json new file mode 100644 index 0000000..eb3d7c1 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/compacter_long-t5-tglobal-large/rte.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "long-t5", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "compacter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": 
"average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large", + "model_path_public": "long-t5-tglobal-large", + "num_train_epochs": 20, + "output_dir": "outputs/compacter/long-t5-tglobal-large/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attn", "ff", "layer_norm"] +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/gen.py b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen.py new file mode 100644 index 0000000..2c751a3 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen.py @@ -0,0 +1,51 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +import argparse +import json +import os +parser = argparse.ArgumentParser("Parser to generate configuration") +parser.add_argument("--job", type=str) +parser.add_argument("--") +args = parser.parse_args() + + +if __name__ == "__main__": + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_albert.py b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_albert.py new file mode 100644 index 0000000..be9af6d --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_albert.py @@ -0,0 +1,116 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} + + +#### ROBERTA###### +BaseConfigs['albert-xlarge-v2'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", 
"qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}albert-xlarge-v2", + "tokenizer_name": f"{PATHBASE}albert-xlarge-v2", + "save_total_limit": 1, + # For glue datasets. + "is_seq2seq": False, + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps" + } + +AllConfigs['prefix_albert-xlarge-v2'] = copy.deepcopy(BaseConfigs['albert-xlarge-v2']) +AllConfigs['prefix_albert-xlarge-v2'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/albert-xlarge-v2/", + }) + +AllConfigs['soft_prompt_albert-xlarge-v2'] = copy.deepcopy(BaseConfigs['albert-xlarge-v2']) +AllConfigs['soft_prompt_albert-xlarge-v2'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/albert-xlarge-v2/", + }) + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_bart.py b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_bart.py new file mode 100644 index 0000000..ec5a2f0 --- /dev/null +++ 
b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_bart.py @@ -0,0 +1,261 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" +# PATHBASE="" + +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['bart-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}bart-base", + "tokenizer_name": f"{PATHBASE}bart-base", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
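+    # checkpoint selection: keep the step whose 'average_metrics' is highest (greater_is_better) and reload it at the end of training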
+    "load_best_model_at_end": True,
+    "metric_for_best_model": "average_metrics",
+    "greater_is_better": True,
+    "evaluation_strategy": "steps",
+    "overwrite_output_dir": True,
+    "push_to_hub": False,
+    "push_to_delta_center": True,
+    "save_strategy": "steps"
+    }
+
+AllConfigs['bitfit_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
+AllConfigs['bitfit_bart-base'].update({
+        "delta_type": "bitfit",
+        "learning_rate": 3e-4,
+        "output_dir": "outputs/bitfit/bart-base/",
+    })
+
+AllConfigs['adapter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
+AllConfigs['adapter_bart-base'].update({
+        "delta_type": "adapter",
+        "learning_rate": 3e-4,
+        "unfrozen_modules": [
+            "deltas",
+            "layer_norm",
+            "final_layer_norm"
+        ],
+        "bottleneck_dim":24,
+        "output_dir": "outputs/adapter/bart-base/",
+    })
+
+AllConfigs['parallel_adapter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
+AllConfigs['parallel_adapter_bart-base'].update({
+        "delta_type": "parallel_adapter",
+        "learning_rate": 3e-4,
+        "unfrozen_modules": [
+            "deltas",
+            "layer_norm",
+            "final_layer_norm"
+        ],
+        "bottleneck_dim":24,
+        "output_dir": "outputs/parallel_adapter/bart-base/",
+    })
+
+AllConfigs['lora_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
+AllConfigs['lora_bart-base'].update({
+        "delta_type": "lora",
+        "learning_rate": 3e-4,
+        "unfrozen_modules": [
+            "deltas",
+            "layer_norm",
+            "final_layer_norm"
+        ],
+        "modified_modules": [
+            "q_proj",
+            "v_proj",
+        ],
+        "lora_r": 8,
+        "output_dir": "outputs/lora/bart-base/",
+    })
+
+AllConfigs['compacter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
+AllConfigs['compacter_bart-base'].update({
+        "delta_type": "compacter",
+        "learning_rate": 3e-3,
+        "unfrozen_modules": [
+            "deltas",
+            "layer_norm",
+            "final_layer_norm"
+        ],
+        "output_dir": "outputs/compacter/bart-base/",
+        "non_linearity": "gelu_new",
+
+        #Compacter.
+        "hypercomplex_division": 4,
+        "hypercomplex_adapters": True,
+        "hypercomplex_nonlinearity": "glorot-uniform",
+        # gradient clip and clamp
+        "gradient_clip": False,
+        "phm_clamp": False,
+        "normalize_phm_weight": False,
+        "learn_phm": True,
+        # shared one side
+        "factorized_phm": True,
+        "shared_phm_rule": False,
+        "factorized_phm_rule": False,
+        "phm_c_init": "normal",
+        "phm_init_range": 0.0001,
+        "use_bias_down_sampler": True,
+        "use_bias_up_sampler": True,
+    })
+
+AllConfigs['compacter++_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
+AllConfigs['compacter++_bart-base'].update({
+        "delta_type": "compacter",
+        "learning_rate": 3e-3,
+        "do_train": True,
+        "do_eval": True,
+        "do_test": True,
+        "modified_modules": [
+            "DenseReluDense"
+        ],
+        "unfrozen_modules": [
+            "deltas",
+            "layer_norm",
+            "final_layer_norm"
+        ],
+        "output_dir": "outputs/compacter++/bart-base/",
+        "non_linearity": "gelu_new",
+
+        #Compacter.
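+        # PHM hyper-parameters (see the Compacter paper): 'hypercomplex_division' is the Kronecker-factor count n,
+        # 'factorized_phm' low-rank factorizes the per-layer weights, and 'shared_phm_rule' shares the small n-by-n rule matrices across layers.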
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base']) +AllConfigs['low_rank_adapter_bart-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/low_rank_adapter/bart-base/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + + +AllConfigs['soft_prompt_bart-base'] = copy.deepcopy(BaseConfigs['bart-base']) +AllConfigs['soft_prompt_bart-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "token_init": False, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/bart-base/", + }) + +AllConfigs['prefix_bart-base'] = copy.deepcopy(BaseConfigs['bart-base']) +AllConfigs['prefix_bart-base'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/bart-base/", + }) + +AllConfigs['soft_prompt_bart-base'] = copy.deepcopy(BaseConfigs['bart-base']) +AllConfigs['soft_prompt_bart-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/bart-base/", + }) + + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_beit.py b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_beit.py new file mode 100644 index 0000000..15550ef --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_beit.py @@ -0,0 +1,250 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['beit-base-patch16-224'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", 
"per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "num_classes"): zip( + ["beans"], + ["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20], + [256], + [ 32], + [ 32],#, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0], # *7 +[0] *8, + [200],# 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200],#, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [ 3], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}beit-base-patch16-224", + "tokenizer_name": f"{PATHBASE}beit-base-patch16-224", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps", + "datasets_load_from_disk":False, + } + +AllConfigs['bitfit_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['bitfit_beit-base-patch16-224'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/beit-base-patch16-224/", + }) + +AllConfigs['adapter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['adapter_beit-base-patch16-224'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/beit-base-patch16-224/", + }) + +AllConfigs['lora_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['lora_beit-base-patch16-224'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layernorm_after", + "classifier" + ], + "modified_modules":[ + "query", + "value", + ], + "lora_r": 8, + "output_dir": "outputs/lora/beit-base-patch16-224/", + }) + +AllConfigs['compacter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['compacter_beit-base-patch16-224'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter/beit-base-patch16-224/", + "non_linearity": "gelu_new", + + #Compacter. 
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['compacter++_beit-base-patch16-224'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter++/beit-base-patch16-224/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['low_rank_adapter_beit-base-patch16-224'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/low_rank_adapter/beit-base-patch16-224/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + + +AllConfigs['soft_prompt_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['soft_prompt_beit-base-patch16-224'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "token_init": False, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/beit-base-patch16-224/", + }) + +AllConfigs['prefix_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['prefix_beit-base-patch16-224'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/beit-base-patch16-224/", + }) + +AllConfigs['soft_prompt_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['soft_prompt_beit-base-patch16-224'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/beit-base-patch16-224/", + }) + + + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + 
all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_bert.py b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_bert.py new file mode 100644 index 0000000..a56f4a0 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_bert.py @@ -0,0 +1,125 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +# PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} + + +#### ROBERTA###### +BaseConfigs['bert-base-cased'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}bert-base-cased", + "tokenizer_name": f"{PATHBASE}bert-base-cased", + "save_total_limit": 1, + # For glue datasets. + "is_seq2seq": False, + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. 
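+    # bert-base-cased is encoder-only, so is_seq2seq is False and predict_with_generate is
+    # disabled above; metrics are computed from logits rather than from generated sequences.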
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps", + "datasets_load_from_disk": True, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/" + } + +AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased']) +AllConfigs['prefix_bert-base-cased'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/bert-base-cased/", + }) + +AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased']) +AllConfigs['soft_prompt_bert-base-cased'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/bert-base-cased/", + }) + +AllConfigs['prefix_bert-large-cased'] = copy.deepcopy(AllConfigs['prefix_bert-base-cased']) +AllConfigs['prefix_bert-large-cased'].update({ + "output_dir": "outputs/prefix/bert-large-cased/", + "model_name_or_path": f"{PATHBASE}bert-large-cased", + "tokenizer_name": f"{PATHBASE}bert-large-cased", +}) + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_bigbird.py b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_bigbird.py new file mode 100644 index 0000000..b5a41e0 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_bigbird.py @@ -0,0 +1,147 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} + +#### ROBERTA ###### +BaseConfigs['bigbird-roberta-large'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", 
"stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}bigbird-roberta-large", + "tokenizer_name": f"{PATHBASE}bigbird-roberta-large", + "save_total_limit": 1, + # For glue datasets. + "is_seq2seq": False, + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + + + +AllConfigs['bitfit_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large']) +AllConfigs['bitfit_bigbird-roberta-large'].update({ + "delta_type": "bitfit", + "learning_rate": 1e-3, + "output_dir": "outputs/bitfit/bigbird-roberta-large/", + }) + +AllConfigs['none_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large']) +AllConfigs['none_bigbird-roberta-large'].update({ + "delta_type": "none", + "learning_rate": 1e-5, + "output_dir": "outputs/none/bigbird-roberta-large/", + }) + + +AllConfigs['lora_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large']) +AllConfigs['lora_bigbird-roberta-large'].update({ + "delta_type": "lora", + "learning_rate": 1e-3, + "modified_modules": [ + "query", + "key", + ], + "output_dir": "outputs/lora/bigbird-roberta-large/", + }) + +AllConfigs['adapter_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large']) +AllConfigs['adapter_bigbird-roberta-large'].update({ + "delta_type": "adapter", + "learning_rate": 1e-3, + "output_dir": "outputs/adapter/bigbird-roberta-large/", + }) + +AllConfigs['low_rank_adapter_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large']) +AllConfigs['low_rank_adapter_bigbird-roberta-large'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 1e-3, + "output_dir": "outputs/low_rank_adapter/bigbird-roberta-large/", + }) + + +AllConfigs['soft_prompt_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large']) +AllConfigs['soft_prompt_bigbird-roberta-large'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/bigbird-roberta-large/", + }) + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = 
[] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_blenderbot.py b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_blenderbot.py new file mode 100644 index 0000000..c0f9653 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_blenderbot.py @@ -0,0 +1,254 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['blenderbot-400M-distill'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}blenderbot-400M-distill", + "tokenizer_name": f"{PATHBASE}blenderbot-400M-distill", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
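+    # max_source_length is capped at 128 for every task above, likely because
+    # blenderbot-400M-distill supports at most 128 position embeddings.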
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps" + } + +AllConfigs['bitfit_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['bitfit_blenderbot-400M-distill'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/blenderbot-400M-distill/", + }) + +AllConfigs['adapter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['adapter_blenderbot-400M-distill'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/blenderbot-400M-distill/", + }) + +AllConfigs['lora_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['lora_blenderbot-400M-distill'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "modified_modules":[ + "q_proj", + "v_proj", + ], + "lora_r": 8, + "output_dir": "outputs/lora/blenderbot-400M-distill/", + }) + +AllConfigs['compacter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['compacter_blenderbot-400M-distill'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter/blenderbot-400M-distill/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['compacter++_blenderbot-400M-distill'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter++/blenderbot-400M-distill/", + "non_linearity": "gelu_new", + + #Compacter. 
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['low_rank_adapter_blenderbot-400M-distill'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/low_rank_adapter/blenderbot-400M-distill/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + +AllConfigs['none_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['none_blenderbot-400M-distill'].update({ + "delta_type": "none", + "learning_rate": 1e-5, + "output_dir": "outputs/none/blenderbot-400M-distill/", + }) + + +AllConfigs['soft_prompt_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['soft_prompt_blenderbot-400M-distill'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "token_init": False, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/blenderbot-400M-distill/", + }) + +AllConfigs['prefix_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['prefix_blenderbot-400M-distill'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/blenderbot-400M-distill/", + }) + +AllConfigs['soft_prompt_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['soft_prompt_blenderbot-400M-distill'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/blenderbot-400M-distill/", + }) + + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_clip.py b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_clip.py new file 
mode 100644 index 0000000..41a59c5 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_clip.py @@ -0,0 +1,303 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +# PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['clip-vit-base-patch32'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "num_classes"): zip( + ["beans"], + ["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20], + [256], + [ 32], + [ 32],#, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0], # *7 +[0] *8, + [200],# 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200],#, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [ 3], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}clip-vit-base-patch32", + "tokenizer_name": f"{PATHBASE}clip-vit-base-patch32", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
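+    # Unlike the NLP generators, the zip above holds a single image-classification task: the
+    # "beans" dataset (angular_leaf_spot / bean_rust / healthy), hence num_classes = 3.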
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps" + } + +AllConfigs['bitfit_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['bitfit_clip-vit-base-patch32'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/clip-vit-base-patch32/", + }) + +AllConfigs['none_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['none_clip-vit-base-patch32'].update({ + "delta_type": "none", + "learning_rate": 1e-5, + "output_dir": "outputs/none/clip-vit-base-patch32/", + }) + +AllConfigs['adapter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['adapter_clip-vit-base-patch32'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/clip-vit-base-patch32/", + }) + +AllConfigs['lora_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['lora_clip-vit-base-patch32'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/clip-vit-base-patch32/", + }) + +AllConfigs['compacter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['compacter_clip-vit-base-patch32'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter/clip-vit-base-patch32/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['compacter++_clip-vit-base-patch32'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter++/clip-vit-base-patch32/", + "non_linearity": "gelu_new", + + #Compacter. 
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['low_rank_adapter_clip-vit-base-patch32'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/low_rank_adapter/clip-vit-base-patch32/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + + +AllConfigs['soft_prompt_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['soft_prompt_clip-vit-base-patch32'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "token_init": False, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/clip-vit-base-patch32/", + }) + +AllConfigs['prefix_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['prefix_clip-vit-base-patch32'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/clip-vit-base-patch32/", + }) + +AllConfigs['soft_prompt_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['soft_prompt_clip-vit-base-patch32'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/clip-vit-base-patch32/", + }) +#### clip-vit-base-patch32 +BaseConfigs['t5-small'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}t5-small", 
+ "tokenizer_name": f"{PATHBASE}t5-small", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps" + } + +AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small']) +AllConfigs['prefix_t5-small'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/t5-small/", + }) + + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_gpt.py b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_gpt.py new file mode 100644 index 0000000..d33e355 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_gpt.py @@ -0,0 +1,433 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['t5-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], 
+ [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}t5-base", + "tokenizer_name": f"{PATHBASE}t5-base", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['bitfit_t5-base'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/t5-base/", + }) + +AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['adapter_t5-base'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/t5-base/", + }) + +AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['lora_t5-base'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/t5-base/", + }) + +AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter++_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter++/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. 
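+    # compacter++ differs from the plain compacter config above only in modified_modules:
+    # the adapters are restricted to T5's feed-forward (DenseReluDense) sub-layers, as in the
+    # Compacter++ variant of Karimi Mahabadi et al. (2021).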
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['low_rank_adapter_t5-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/low_rank_adapter/t5-base/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + + +AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['soft_prompt_t5-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "token_init": False, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/t5-base/", + }) + +AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['prefix_t5-base'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/t5-base/", + }) + +#### T5-base +BaseConfigs['t5-small'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}t5-small", + "tokenizer_name": f"{PATHBASE}t5-small", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small']) +AllConfigs['prefix_t5-small'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/t5-small/", + }) + + + + +#### ROBERTA###### +BaseConfigs['roberta-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}roberta-base", + "tokenizer_name": f"{PATHBASE}roberta-base", + "save_total_limit": 1, + # For glue datasets. + "is_seq2seq": False, + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. 
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + + + +AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['bitfit_roberta-base'].update({ + "delta_type": "bitfit", + "learning_rate": 1e-3, + "output_dir": "outputs/bitfit/roberta-base/", + }) + +AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['none_roberta-base'].update({ + "delta_type": "none", + "learning_rate": 1e-5, + "output_dir": "outputs/none/roberta-base/", + }) + + +AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['lora_roberta-base'].update({ + "delta_type": "lora", + "learning_rate": 1e-3, + "output_dir": "outputs/lora/roberta-base/", + }) + +AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['adapter_roberta-base'].update({ + "delta_type": "adapter", + "learning_rate": 1e-3, + "output_dir": "outputs/adapter/roberta-base/", + }) + +AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['low_rank_adapter_roberta-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 1e-3, + "output_dir": "outputs/low_rank_adapter/roberta-base/", + }) + +#### ROBERTA###### +BaseConfigs['bert-base-cased'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}bert-base-cased", + "tokenizer_name": f"{PATHBASE}bert-base-cased", + "save_total_limit": 1, + # For glue datasets. + "is_seq2seq": False, + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. 
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased']) +AllConfigs['prefix_bert-base-cased'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/bert-base-cased/", + }) + +AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased']) +AllConfigs['soft_prompt_bert-base-cased'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/bert-base-cased/", + }) + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_roberta.py b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_roberta.py new file mode 100644 index 0000000..c16bdda --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_roberta.py @@ -0,0 +1,163 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +# PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} + +#### ROBERTA###### +BaseConfigs['roberta-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + 
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}roberta-base", + "tokenizer_name": f"{PATHBASE}roberta-base", + "save_total_limit": 1, + # For glue datasets. + "is_seq2seq": False, + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps", + "datasets_load_from_disk": True, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/" + } + + + +AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['bitfit_roberta-base'].update({ + "delta_type": "bitfit", + "learning_rate": 1e-3, + "output_dir": "outputs/bitfit/roberta-base/", + }) + +AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['none_roberta-base'].update({ + "delta_type": "none", + "learning_rate": 1e-5, + "output_dir": "outputs/none/roberta-base/", + }) + + +AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['lora_roberta-base'].update({ + "delta_type": "lora", + "learning_rate": 1e-3, + "output_dir": "outputs/lora/roberta-base/", + }) + +AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['adapter_roberta-base'].update({ + "delta_type": "adapter", + "learning_rate": 1e-3, + "output_dir": "outputs/adapter/roberta-base/", + }) + +AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['low_rank_adapter_roberta-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 1e-3, + "output_dir": "outputs/low_rank_adapter/roberta-base/", + }) + + +AllConfigs['soft_prompt_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['soft_prompt_roberta-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/roberta-base/", + }) + +AllConfigs['prefix_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['prefix_roberta-base'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/roberta-base/", + }) + + +AllConfigs['prefix_roberta-large'] = copy.deepcopy(AllConfigs['prefix_roberta-base']) +AllConfigs['prefix_roberta-large'].update({ + "output_dir": "outputs/prefix/prefix_roberta-large", + "model_name_or_path": f"{PATHBASE}prefix_roberta-large", + "tokenizer_name": f"{PATHBASE}prefix_roberta-large", +}) + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + 
Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_t5.py b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_t5.py new file mode 100644 index 0000000..7040fb6 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/gen_t5.py @@ -0,0 +1,300 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +# PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['t5-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}t5-base", + "tokenizer_name": f"{PATHBASE}t5-base", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
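+    # (editor note) The keys from here on mirror HuggingFace TrainingArguments
+    # (load_best_model_at_end, metric_for_best_model, evaluation_strategy, save_strategy, ...),
+    # plus repo-specific flags such as push_to_dc, which appears to control DeltaCenter uploads.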
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hf": False, + "push_to_dc": True, + "save_strategy": "steps", + "datasets_load_from_disk": True, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "backbone_model": "t5", # use in delta center, + "model_path_public": "t5-base", # use in delta center, + + } + +AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['bitfit_t5-base'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/t5-base/", + }) + +AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['adapter_t5-base'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/t5-base/", + }) + +AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['lora_t5-base'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "lora_r": 8, + "output_dir": "outputs/lora/t5-base/", + }) + +AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter++_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter++/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. 
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['low_rank_adapter_t5-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/low_rank_adapter/t5-base/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + + +AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['soft_prompt_t5-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "token_init": False, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/t5-base/", + }) + +AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['prefix_t5-base'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "reparameterize": False, + "output_dir": "outputs/prefix/t5-base/", + }) + +AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['soft_prompt_t5-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/t5-base/", + }) +#### T5-base +BaseConfigs['t5-small'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}t5-small", + "tokenizer_name": f"{PATHBASE}t5-small", + "save_total_limit": 1, + # For glue datasets. 
+ "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps" + } + +AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small']) +AllConfigs['prefix_t5-small'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/t5-small/", + }) + + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/lora_beit-large-patch16-224/cifar10.json b/OpenDelta-0.3.2/examples/examples_prompt/configs/lora_beit-large-patch16-224/cifar10.json new file mode 100644 index 0000000..1a4d789 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/lora_beit-large-patch16-224/cifar10.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "beit", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk", + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "cifar10", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224", + "model_path_public": "beit-large-patch16-224", + "num_classes": 10, + "num_train_epochs": 20, + "output_dir": "outputs/lora/beit-large-patch16-224/cifar10", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "cifar10", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "cifar10", + "tokenizer_name": 
"/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["query","value"] +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/lora_gpt-j-6B/wikitext.json b/OpenDelta-0.3.2/examples/examples_prompt/configs/lora_gpt-j-6B/wikitext.json new file mode 100644 index 0000000..11ebfde --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/lora_gpt-j-6B/wikitext.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "gpt-j", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "wikitext", + "eval_steps": 500, + "evaluation_strategy": "steps", + "gradient_accumulation_steps":4, + "greater_is_better": false, + "learning_rate": 0.00003, + "load_best_model_at_end": true, + "max_source_length": 512, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B", + "model_path_public": "gpt-j-6B", + "num_train_epochs": 2, + "output_dir": "outputs/lora/gpt-j-6B/wikitext", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 2, + "per_device_train_batch_size": 2, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 500, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "wikitext", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "wikitext", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["20.attn.q_proj","21.attn.q_proj","22.attn.q_proj","23.attn.q_proj","24.attn.q_proj","25.attn.q_proj","26.attn.q_proj","27.attn.q_proj"] +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/lora_roberta-large/superglue-boolq.json b/OpenDelta-0.3.2/examples/examples_prompt/configs/lora_roberta-large/superglue-boolq.json new file mode 100644 index 0000000..9ef9cff --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/lora_roberta-large/superglue-boolq.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "roberta-large", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-boolq", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "is_seq2seq": false, + "learning_rate": 0.0001, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large", + "model_path_public": "roberta-large", + "num_train_epochs": 20, + "output_dir": "outputs/lora/roberta-large/superglue-boolq", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": false, + "push_to_hub": false, + "push_to_dc": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + 
"split_validation_test": true, + "task_name": "superglue-boolq", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-boolq", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["query","value"] +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/lora_xlm-roberta-large/superglue-wic.json b/OpenDelta-0.3.2/examples/examples_prompt/configs/lora_xlm-roberta-large/superglue-wic.json new file mode 100644 index 0000000..35a42f1 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/lora_xlm-roberta-large/superglue-wic.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "xlm-roberta-large", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wic", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "is_seq2seq": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large", + "model_path_public": "xlm-roberta-large", + "num_train_epochs": 20, + "output_dir": "outputs/lora/xlm-roberta-large/superglue-wic", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hub": false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "superglue-wic", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wic", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["query","value"] +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/low_rank_adapter_gpt2/wikitext.json b/OpenDelta-0.3.2/examples/examples_prompt/configs/low_rank_adapter_gpt2/wikitext.json new file mode 100644 index 0000000..3a60852 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/low_rank_adapter_gpt2/wikitext.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "gpt2", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "low_rank_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "wikitext", + "eval_steps": 200, + "evaluation_strategy": "steps", + "gradient_accumulation_steps":1, + "greater_is_better": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 768, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt2", + "model_path_public": "gpt2", + "num_train_epochs": 2, + "output_dir": "outputs/low_rank_adapter/gpt2/wikitext", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": 
false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "wikitext", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "wikitext", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt2", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attn","mlp"] +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/prefix_bert-large-cased/rte.json b/OpenDelta-0.3.2/examples/examples_prompt/configs/prefix_bert-large-cased/rte.json new file mode 100644 index 0000000..5d67563 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/prefix_bert-large-cased/rte.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "bert-large-cased", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "is_seq2seq": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/bert-large-cased/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hub": false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attention"] +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/configs/soft_prompt_bart-large/superglue-boolq.json b/OpenDelta-0.3.2/examples/examples_prompt/configs/soft_prompt_bart-large/superglue-boolq.json new file mode 100644 index 0000000..19cbbba --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/configs/soft_prompt_bart-large/superglue-boolq.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "bart", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "soft_prompt", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-boolq", + "eval_steps": 500, + "evaluation_strategy": "steps", + "gradient_accumulation_steps":1, + "greater_is_better": true, + "learning_rate": 0.1, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bart-large", + "model_path_public": "bart-large", + "num_train_epochs": 50, + "output_dir": "outputs/soft_prompt/bart-large/superglue-boolq", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": 
false, + "save_steps": 500, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "soft_token_num":100, + "split_validation_test": true, + "task_name": "superglue-boolq", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-boolq", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bart-large", + "token_init": true, + "unfrozen_modules": [ + "deltas" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/data_processors/__init__.py b/OpenDelta-0.3.2/examples/examples_prompt/data_processors/__init__.py new file mode 100644 index 0000000..7d2b503 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/data_processors/__init__.py @@ -0,0 +1,3 @@ +from .tasks import TASK_MAPPING, AutoTask +# from .data_collator import TaskDataCollatorForSeq2Seq +# from .postprocessors import AutoPostProcessor diff --git a/OpenDelta-0.3.2/examples/examples_prompt/data_processors/processor.py b/OpenDelta-0.3.2/examples/examples_prompt/data_processors/processor.py new file mode 100644 index 0000000..9986100 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/data_processors/processor.py @@ -0,0 +1,102 @@ +import abc +from typing import Callable, List, Mapping, Dict +import datasets +import logging +import numpy as np +import torch +logger = logging.getLogger(__name__) + + +class AbstractTask(abc.ABC): + name = NotImplemented + config = NotImplemented + prefix = NotImplemented + metric = NotImplemented + metric_names = NotImplemented + split_map = None + labels_list = None + split_to_data_split: Mapping[str, str] = \ + {"train": "train", "validation": "validation", "test": "test"} + split_valid_to_make_test = True + split_train_to_make_test = False + keep_fields_after_preprocess = ["label"] # The fields that should be kept even after preprocessiing + + def __init__(self, config, data_args, seed=42, default_max_length=1): + self.config = config + self.seed = seed + self.data_args = data_args + + self.default_max_length = default_max_length + + def check_n_obs(self, n_obs, total_size): + if n_obs is not None and n_obs > total_size: + n_obs = total_size + logger.warning("n_obs is set to %s", n_obs) + return n_obs + + def shuffled_indices(self, dataset): + num_samples = len(dataset) + generator = torch.Generator() + generator.manual_seed(self.seed) + return torch.randperm(num_samples, generator=generator).tolist() + + def subsample(self, dataset, n_obs=None, indices=None): + """ + Given a dataset returns the subsampled dataset. + :param n_obs: the number of samples of the subsampled dataset. + :param indices: indices to select the samples from, if not given, indices are computed + from by shuffling the given dataset. + :return: subsampled dataset. 
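+        Example (illustrative): subsample(dataset, n_obs=100) shuffles with the
+        task-level seed and keeps the first 100 of the shuffled examples.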
+ """ + num_samples = len(dataset) + n_obs = self.check_n_obs(n_obs, num_samples) + if indices is None: + indices = self.shuffled_indices(dataset) + indices = indices[:n_obs] + return dataset.select(indices) + + def load_dataset(self, split: int): + return datasets.load_dataset(self.name, self.config, split=split, script_version="master") + + def get_split_indices(self, split, dataset, validation_size): + indices = self.shuffled_indices(dataset) + if split == "validation": + return indices[:validation_size] + else: + return indices[validation_size:] + + def preprocessor(self, example): + return example + + def get(self, split, n_obs=None, split_validation_test=False): + # For small datasets (n_samples < 10K) without test set, we divide validation set to + # half, use one half as test set and one half as validation set. + if split in ["eval", "dev", "valid"]: + split = "validation" + if split_validation_test and self.split_valid_to_make_test \ + and split != "train": + mapped_split = self.split_to_data_split["validation"] + dataset = self.load_dataset(split=mapped_split) + indices = self.get_split_indices(split, dataset, validation_size=len(dataset)//2) + dataset = self.subsample(dataset, n_obs, indices) + # For larger datasets (n_samples > 10K), we divide training set into 1K as + # validation and the rest as training set, keeping the original validation + # set as the test set. + elif split_validation_test and self.split_train_to_make_test \ + and split != "test": + dataset = self.load_dataset(split="train") + indices = self.get_split_indices(split, dataset, validation_size=1000) + dataset = self.subsample(dataset, n_obs, indices) + else: + mapped_split = self.split_to_data_split[split] + dataset = self.load_dataset(split=mapped_split) + # shuffles the data and samples it. 
+ if n_obs is not None: + dataset = self.subsample(dataset, n_obs) + + this_method = getattr(self.__class__, 'preprocessor') + base_method = getattr(AbstractTask, 'preprocessor') + if this_method is not base_method: + return dataset.map(self.preprocessor) + else: + return dataset diff --git a/OpenDelta-0.3.2/examples/examples_prompt/data_processors/prompt.py b/OpenDelta-0.3.2/examples/examples_prompt/data_processors/prompt.py new file mode 100644 index 0000000..70f9463 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/data_processors/prompt.py @@ -0,0 +1,153 @@ +# from openprompt.prompts import ManualTemplate + +class BasePrompt(object): + def __init__(self, template_id=0, verbalizer_id=0, generation=True): + self.template = self.textual_templates[template_id] + if generation: + self.verbalizer = self.generation_verbalizers[verbalizer_id] + else: + self.verbalizer = self.mlmhead_verbalizers[verbalizer_id] + + + + def __call__(self, example): + + def eval_syntax(syntaxlist, example): + composed = [] + for x in syntaxlist: + if x.startswith("[_eval_]"): + t = eval(x[len("[_eval_]"):]) + else: + t = x + composed.append(t) + return composed + src_texts = eval_syntax(self.template,example) + + tgt_texts = self.verbalizer[str(example['label'])] + if isinstance(tgt_texts, list): + tgt_texts = eval_syntax(tgt_texts, example) + else: + tgt_texts = [tgt_texts] + return src_texts, tgt_texts + + + + + +class MRPCPrompt(BasePrompt): + generation_verbalizers = [ + { + "0": "different", + "1": "same" + }, + { + "0": "not_equivalent", + "1": "equivalent" + } + ] + mlmhead_verbalizers = { + "0": "different", + "1": "same" + } + textual_templates = [ + ["sentence1:", """[_eval_]example['sentence1']""", + "sentence2:", """[_eval_]example["sentence2"]""", "Meanings different of same? Answer: " ] + ] + +class BoolQPrompt(BasePrompt): + generation_verbalizers = [ + { + "0": "different", + "1": "same" + }, + { + "0": "not_equivalent", + "1": "equivalent" + } + ] + mlmhead_verbalizers = { + "0": "different", + "1": "same" + } + textual_templates = [ + ["sentence1:", """[_eval_]example['sentence1']""", + "sentence2:", """[_eval_]example["sentence2"]""", "Meanings different of same? Answer: " ] + ] + +class BoolQPrompt(BasePrompt): + generation_verbalizers = [ + { + "0": "no", + "1": "yes" + }, + ] + mlmhead_verbalizers = { + "0": "no", + "1": "yes" + } + textual_templates = [ + ["hypothesis:", """[_eval_]example['hypothesis']""", + "premise:", """[_eval_]example["premise"]""", "The answer was " ] + ] + +class COLAPrompt(BasePrompt): + generation_verbalizers = [ + { + "0": "No", + "1": "Yes" + }, + ] + mlmhead_verbalizers = { + "0": "No", + "1": "Yes" + } + textual_templates = [ + ["sentence:", """[_eval_]example['sentence']""", + "grammar correct? 
" ] + ] + + +class RTEPrompt(BasePrompt): + generation_verbalizers = [ + { + "0": "yes", + "1": "no" + }, + ] + mlmhead_verbalizers = { + "0": "yes", + "1": "no" + } + textual_templates = [ + ["sentence1:", """[_eval_]example['premise']""", "sentence2:", + """[_eval_]example['hypothesis']""", + "The answer was "] + ] + +class CBPrompt(BasePrompt): + generation_verbalizers = [{ + "0": "yes", + "1": "no", + "2": "maybe" + }, + ] + mlmhead_verbalizers = [{ + "0": "yes", + "1": "no", + "2": "maybe" + }] + textual_templates = [ + ["hypothesis:", """[_eval_]example['hypothesis']""", "premise:", + """[_eval_]example['premise']""", + "The answer was " ] + ] + +PromptCollections = { + "mrpc": MRPCPrompt, + "cola": COLAPrompt, + "rte": RTEPrompt, + "superglue-boolq": BoolQPrompt, + "cb": CBPrompt, +} + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/data_processors/tasks.py b/OpenDelta-0.3.2/examples/examples_prompt/data_processors/tasks.py new file mode 100644 index 0000000..7d0402a --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/data_processors/tasks.py @@ -0,0 +1,663 @@ +from collections import OrderedDict +import collections +import abc +import functools +from selectors import EpollSelector +from typing import Callable, List, Mapping +from .utils import pad_punctuation +from examples_prompt.metrics import metrics +from .utils import round_stsb_target +import datasets +import logging +import numpy as np +import torch +import re +import itertools +import os + +logger = logging.getLogger(__name__) + + +from transformers.models.auto.tokenization_auto import tokenizer_class_from_name + +from typing import List, Dict +from collections import defaultdict +import warnings + + +from .processor import AbstractTask + +class Squad(AbstractTask): + name = "squad" + metric = [metrics.squad] + + def load_dataset(self, split): + return datasets.load_dataset(self.name, split=split, script_version="master") + + def preprocessor(self, example, add_prefix): + answer = pad_punctuation(example['answers']['text'][0]) + question = pad_punctuation(example['question']) + context = pad_punctuation(example['context']) + source = ["question:", question, + "context:", context] + target = [answer] + return self.seq2seq_format(source, target, add_prefix) + + +##GLUE +class COLA(AbstractTask): + name = "cola" + labels_list = ["0", "1"] + metric = [metrics.matthews_corrcoef] + metric_names = ["matthews_correlation"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + templates_text = {"0": """sentence: {"meta": 'sentence', "shortenable":True} Are there any error in the sentence? 
{"mask"}""", + } + + verbalizers = { + "0":{ "0": "yes", "1": "no"} + } + + def load_dataset(self, split): + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.cola")[split] + else: + return datasets.load_dataset('glue', 'cola', + split=split, script_version="master") + + +class SST2(AbstractTask): + name = "sst2" + labels_list = ["0", "1"] + metric = [metrics.accuracy] + metric_names = ["accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + verbalizers = { + "0":{"0":"negative","1":"positive"}, + "blenderbot":{"0":"negative","1":"positive"} + + } + + templates_text = { + "0":"""The sentiment of sentence: "{"meta":"sentence", "shortenable":True}" is {"mask"}.""", + "blenderbot": """{"meta":"sentence", "shortenable":True} what is the sentiment?""" + } + + def load_dataset(self, split): + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.sst2")[split] + else: + return datasets.load_dataset('glue', 'sst2', + split=split, script_version="master") + + + +class MRPC(AbstractTask): + name = "mrpc" + labels_list = ["0", "1"] + metric = [metrics.f1_score, metrics.accuracy] + metric_names = ["f1", "accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + + templates_text = { + "0": """sentence1: {"meta": 'sentence1', "shortenable":True}. sentence2: {"meta":"sentence2", "shortenable":True}. Are sentence1 and sentence2 equivalent? {"mask"}.""", + } + + verbalizers = { + "0":{"0": "no","1": "yes"} + } + + + def load_dataset(self, split): + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.mrpc")[split] + else: + return datasets.load_dataset('glue', 'mrpc', split=split, script_version="master") + + + +class QQP(AbstractTask): + name = "qqp" + labels_list = ["0", "1"] + metric = [metrics.f1_score, metrics.accuracy] + metric_names = ["f1", "accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + templates_text = {"0": + """question1: {"meta": 'question1', "shortenable":True}. question2: {"meta": 'question2', "shortenable":True} Are question1 and question2 equivalent? 
{"mask"}.""" + } + + verbalizers = { + "0":{"0": "no","1": "yes"} + } + + + def load_dataset(self, split): + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.qqp")[split] + else: + return datasets.load_dataset('glue', 'qqp', + split=split, script_version="master") + + + +class STSB(AbstractTask): + name = "stsb" + labels_list = [str(np.round(label, decimals=1)) for label in np.arange(0, 5.2, 0.2)] + metric = [metrics.pearson_corrcoef, metrics.spearman_corrcoef] + metric_names = ["pearson", "spearmanr"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + + verbalizers = { + "" + } + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'stsb', + split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["sentence1:", example['sentence1'], + "sentence2:", example["sentence2"]] + tgt_texts = [str(round_stsb_target(example['label']))] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class MNLI(AbstractTask): + name = "mnli" + labels_list = ["0", "1", "2"] + split_to_data_split = {"train": "train", + "validation": "validation_mismatched", + "test": "validation_matched"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + + templates_text = { + "0":"""premise: {"meta": 'premise', "shortenable":True}. hypothesis: {"meta": 'hypothesis', "shortenable":True} Does the premise entails the hypothesis? {"mask"}.""", + } + + verbalizers = { + "0":{ + "0": "yes", + "1": "neutral", + "2": "no", + } + } + + def load_dataset(self, split): + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.mnli")[split] + else: + return datasets.load_dataset('glue', 'mnli', split=split, script_version="master") + + # def preprocessor(self, example, add_prefix=True): + # src_texts = ["premise:", example['premise'], + # "hypothesis", example["hypothesis"]] + # tgt_texts = [str(example['label'])] + # return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class QNLI(AbstractTask): + name = "qnli" + labels_list = ["0", "1"] + metric = [metrics.accuracy] + metric_names = ["accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + templates_text = { + "0": """premise: {"meta": 'sentence', "shortenable":True}. hypothesis: {"meta": 'question', "shortenable":True}"""+ + """Does the premise entails the hypothesis? 
{"mask"}.""", + } + + verbalizers = { + "0":{ + "0": "yes", + "1": "no", + } + } + + + def load_dataset(self, split): + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.qnli")[split] + else: + return datasets.load_dataset('glue', 'qnli', split=split, script_version="master") + + # def load_dataset(self, split): + # return datasets.load_dataset('glue', 'qnli', split=split, script_version="master") + + # def preprocessor(self, example, add_prefix=True): + # src_texts = ["question:", example['question'], + # "sentence:", example["sentence"]] + # tgt_texts = [str(example['label'])] + # return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + +#Tested +class RTE(AbstractTask): + name = "rte" + labels_list = ["0", "1"] + metric = [metrics.accuracy] + metric_names = ["accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + + templates_text = { + "0": """sentence1: {"meta": 'sentence1', "shortenable":True} sentence2: {"meta":"sentence2", "shortenable":True} The answer was {"mask"}.""", + } + + verbalizers = { + "0":{"0": "yes", + "1": "no" + } + } + + def load_dataset(self, split): + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.rte")[split] + else: + return datasets.load_dataset('glue', 'rte', + split=split, script_version="master") + + + +class WNLI(AbstractTask): + name = "wnli" + labels_list = ["0", "1"] + metric = [metrics.accuracy] + metric_names = ["accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + verbalizers = { + "0":{"0": "True", + "1": "False", + } + } + templates_text = {"0": """{"meta": 'sentence1',"shortenable":True} Does it mean the following: "{"meta":'sentence2'}"? 
{"mask"}.""" + } + + + def load_dataset(self, split): + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.wnli")[split] + else: + return datasets.load_dataset('glue', 'wnli', split=split, script_version="master") + + +#SuperGLUE +class SuperGLUEBoolQ(AbstractTask): + name="superglue-boolq" + labels_list = ['0', '1'] + metric = [metrics.accuracy] + metric_names = ["accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + verbalizers = { + "0": { + "0": "no", + "1": "yes" + }, + } + + templates_text = { + "0": """hypothesis: {"meta": "question", "shortenable":True} premise: {"meta":"passage", "shortenable":True} The answer was {"mask"}.""" + } + + def load_dataset(self, split): + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.boolq")[split] + else: + return datasets.load_dataset('super_glue', 'boolq', split=split, script_version="master") + + +# +class SuperGLUECB(AbstractTask): + name = "superglue-cb" + labels_list = ['0', '1', '2'] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + verbalizers = { + "0":{"0": "yes", + "1": "no", + "2": "maybe" + } + } + templates_text = { + "0": """hypothesis: {"meta": 'hypothesis',"shortenable":True} premise: {"meta":'premise', "shortenable":True} The answer was {"mask"}.""" + } + + def load_dataset(self, split): + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.cb")[split] + else: + return datasets.load_dataset('super_glue', 'cb', split=split, script_version="master") + + +class SuperGLUECOPA(AbstractTask): + name = "superglue-copa" + labels_list = ['0', '1'] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + verbalizers = { + "0":{ + "0": "1", + "1": "2", + } + } + templates_text = { + "0": """choice1: {"meta":"choice1"} choice2: {"meta":"choice2"} premise: {"meta":"premise", "shortenable":True} The {"meta":"question"} answer was choice{"mask"}.""" + } + + def load_dataset(self, split): + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.copa")[split] + else: + return datasets.load_dataset('super_glue', 'copa', split=split, script_version="master") + + +class SuperGLUEMultiRC(AbstractTask): + name = "superglue-multirc" + labels_list = ['0', '1'] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.f1_score, + metrics.accuracy] + metric_names = ["f1", "em"] + + + verbalizers = { + "0": { + "0": "no", + "1": "yes", + } + } + templates_text = { + "0": """question: {"meta":"question", "shortenable":False} answer: {"meta":"answer", "shortenable":False, "post_processing": lambda x:x+"."} paragraph: {"meta":"paragraph", "shortenable":True} The answer was {"mask"}.""" + } + + + def load_dataset(self, split): + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.multirc")[split] + else: + return 
datasets.load_dataset('super_glue', 'multirc', split=split, script_version="master") + + def remove_markup(self, text): + """Removes the HTML markup.""" + text = re.sub('<br>
', ' ', text) + text = re.sub('<(/)?b>', '', text) + return text + + def preprocessor(self, example): + # T5 applies remove_markup to the joined string, but this should not make + # any difference as well. + # https://github.com/google-research/text-to-text-transfer-transformer/blob/a1352e625db7ec114062f99d99b0565b9e45c155/t5/data/preprocessors.py#L797 + example["question"] = self.remove_markup(example["question"]) + example["answer"] = self.remove_markup(example["answer"]) + example["paragraph"] = self.remove_markup(example["paragraph"]) + return example + + + +class SuperGLUEWIC(AbstractTask): + name = "superglue-wic" + labels_list = ['0', '1'] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + verbalizers = { + "0": { + "0": "No", + "1": "Yes", + } + } + + templates_text = { + "0": """sentence1: {"meta":"sentence1"} sentence2: {"meta":"sentence2", "shortenable": True} word: {"meta":"word"} {"mask"}.""" + } + + def load_dataset(self, split): + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.wic")[split] + else: + return datasets.load_dataset('super_glue', 'wic', split=split, script_version="master") + + +# class SuperGLUERecord(AbstractTask): +# """Convert ReCoRD examples to text2text examples. +# ReCoRD contains a passage, query containing a '@placeholder' string, and a set +# of entities that are the possible values of the placeholder. Each train and +# validation example will have a list of answers, any of which would be +# considered correct. +# For example, a typical example from ReCoRD might look like +# { +# 'passsage': 'This is the passage.', +# 'query': 'A @placeholder is a bird.', +# 'entities': ['penguin', 'potato', 'pigeon'], +# 'answers': ['penguin', 'pigeon'], +# } +# which this preprocessor would turn into the following two examples: +# { +# 'inputs': 'record query: A @placeholder is a bird. entities: penguin, ' +# 'potato, pigeon passage: This is the passage.', +# 'targets': 'penguin', +# } +# and +# { +# 'inputs': 'record query: A @placeholder is a bird. entities: penguin, ' +# 'potato, pigeon passage: This is the passage.', +# 'targets': 'pigeon', +# } +# """ +# name = "superglue-record" +# split_to_data_split = {"train": "train", +# "validation": "validation", +# "test": "validation"} +# metric = [metrics.squad] +# metric_names = ["squad"] + +# def load_dataset(self, split): +# return datasets.load_dataset('super_glue', 'record', split=split, script_version="master") + +# def preprocessor(self, batch, add_prefix=True): +# new_batch = collections.defaultdict(list) +# keys = batch.keys() +# for values in zip(*batch.values()): +# ex = {k: v for k, v in zip(keys, values)} +# # updates the passage. +# passage = ex['passage'] +# passage = re.sub(r'(\.|\?|\!|\"|\')\n@highlight\n', r'\1 ', passage) +# passage = re.sub(r'\n@highlight\n', '. ', passage) +# inputs = f"record query: {ex['query']} entities: {', '.join(ex['entities'])} passage: {passage}" +# if add_prefix: +# inputs = self.name + " " + inputs +# # duplicates the samples based on number of answers. 
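+#         # (illustrative, from the class docstring above) answers ['penguin', 'pigeon'] emit
+#         # the same "inputs" string twice, once with target 'penguin' and once with 'pigeon'.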
+# num_answers = len(ex["answers"]) +# num_duplicates = np.maximum(1, num_answers) +# new_batch["source"].extend([inputs] * num_duplicates) +# new_batch["target"].extend(ex["answers"] if num_answers > 0 else [""]) +# new_batch["task"].extend([self.name] * num_duplicates) +# new_batch["extra_fields"].extend([{"answers": ex["answers"]}]*num_duplicates) +# return new_batch + +# def map_dataset(self, dataset, add_prefix=True): +# return dataset.map(functools.partial(self.preprocessor, add_prefix=add_prefix), +# batched=True, remove_columns=dataset.column_names) + +class Beans(AbstractTask): + name = "beans" + labels_list = ['angular_leaf_spot', 'bean_rust', "healthy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + verbalizers = { + "clip": { + "angular_leaf_spot": "angular_leaf_spot", + "bean_rust": "bean_rust", + "healthy": "healthy", + } + } + + templates_text = { + "clip":"""a photo of {"mask"} leaf.""" + } + + def load_dataset(self, split): + # from IPython import embed; embed(header="beans") + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/beans")[split] + else: + return datasets.load_dataset('beans', split=split, script_version="master") + +class Wikitext(AbstractTask): + #wikitext-2-v1 + name = "wikitext" + # labels_list = ['angular_leaf_spot', 'bean_rust', "healthy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.perplexity] + metric_names = ["perplexity"] + + verbalizers = { + "0": { + } + } + + templates_text = { + "0": """{"meta":"text"}""" + } + split_valid_to_make_test = True + def load_dataset(self, split): + # from IPython import embed; embed(header="beans") + if self.data_args.datasets_load_from_disk: + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/wikitext")[split] + else: + return datasets.load_dataset('wikitext','wikitext-2-v1', split=split, script_version="master") + +class Cifar10(AbstractTask): + name = "cifar10" + + split_to_data_split = {"train": "train", + "validation": "test", + "test": "test"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + def load_dataset(self, split): + if self.data_args.datasets_load_from_disk: + d = datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/cifar10")[split].select(range(100)) + print(d) + return d + else: + return datasets.load_dataset('cifar10', split=split, script_version="master") + # def preprocessor(self, example): + # example_ = {} + # example_["image"] = example["image"] + # example_["labels"] = example["label"] + + # return example_ +class Fashion_MNIST(AbstractTask): + name = "Fashion-MNIST" + + split_to_data_split = {"train": "train", + "validation": "test", + "test": "test"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + def load_dataset(self, split): + if self.data_args.datasets_load_from_disk: + d = datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/fashion_mnist")[split] + print(d) + return d + else: + return datasets.load_dataset('fashion_mnist', split=split, script_version="master") + +TASK_MAPPING = OrderedDict( + [ + ('squad', Squad), + ('mrpc', MRPC), + ('cola', COLA), + ('sst2', SST2), + ('qnli', QNLI), + ('rte', RTE), + ('wnli', WNLI), + ('mnli', MNLI), + ('qqp', QQP), + ('stsb', STSB), + ('superglue-boolq', SuperGLUEBoolQ), + ('superglue-cb', SuperGLUECB), + 
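+        # (illustrative sketch, not in the original file) entries in this mapping are
+        # retrieved via AutoTask.get(name, config, data_args), e.g.:
+        #   task = AutoTask.get("superglue-cb", config=None, data_args=data_args, seed=42)
+        #   train_split = task.get("train", split_validation_test=True)
+        # where data_args is assumed to carry datasets_load_from_disk / datasets_saved_path.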
('superglue-copa', SuperGLUECOPA), + ('superglue-multirc', SuperGLUEMultiRC), + ('superglue-wic', SuperGLUEWIC), + # ('superglue-record', SuperGLUERecord) + ('beans', Beans), + ('wikitext',Wikitext), + ('cifar10',Cifar10), + ('fashion_mnist',Fashion_MNIST) + ] +) + +class AutoTask: + @classmethod + def get(self, task, config, data_args, seed=42): + if task in TASK_MAPPING: + return TASK_MAPPING[task](config, data_args, seed) + raise ValueError( + "Unrecognized task {} for AutoTask Model: {}.\n" + "Task name should be one of {}.".format( + ", ".join(c for c in TASK_MAPPING.keys()) + ) + ) diff --git a/OpenDelta-0.3.2/examples/examples_prompt/data_processors/utils.py b/OpenDelta-0.3.2/examples/examples_prompt/data_processors/utils.py new file mode 100644 index 0000000..38c9f00 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/data_processors/utils.py @@ -0,0 +1,30 @@ +import numpy as np +import re + +def round_stsb_target(label): + """STSB maps two sentences to a floating point number between 1 and 5 + representing their semantic similarity. Since we are treating all tasks as + text-to-text tasks we need to convert this floating point number to a string. + The vast majority of the similarity score labels in STSB are in the set + [0, 0.2, 0.4, ..., 4.8, 5.0]. So, we first round the number to the closest + entry in this set, and then we convert the result to a string (literally e.g. + "3.4"). This converts STSB roughly into a 26-class classification dataset. + Args: + label: original label. + Returns: + A preprocessed label. + """ + return np.round((label * 5) / 5, decimals=1) + + +def pad_punctuation(text): + """Re-implementation of _pad_punctuation in t5. This function adds spaces + around punctuation. While this pads punctuation as expected, it has the + unexpected effected of padding certain unicode characters with accents, with + spaces as well. For instance: "François" becomes "Fran ç ois""" + # Pad everything except for: underscores (_), whitespace (\s), + # numbers (\p{N}), letters (\p{L}) and accent characters (\p{M}). + text = re.sub(r'([^_\s\p{N}\p{L}\p{M}])', r' \1 ', text) + # Collapse consecutive whitespace into one space. 
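+    # (editor note) end-to-end example: "Hello, world!" -> "Hello , world !" after the
+    # whitespace collapse below. Note that the \p{...} classes above need the third-party
+    # `regex` module; the stdlib `re` imported in this file does not support them.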
+ text = re.sub(r'\s+', ' ', text) + return text \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/hyperopt/search.sh b/OpenDelta-0.3.2/examples/examples_prompt/hyperopt/search.sh new file mode 100644 index 0000000..dfda3df --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/hyperopt/search.sh @@ -0,0 +1,44 @@ + +PATHBASE=/mnt/sfs_turbo/hsd/officialod/OpenDelta-1/examples/examples_prompt/ +PYTHONPATH=/mnt/sfs_turbo/zhangshudan/anaconda3/envs/officialod/bin/python +PLMPATHBASE=/mnt/sfs_turbo/hsd/plm_cache/ # must be empty string or dir that ends with / +DATASETSPATHBASE=/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/ +RUNTIME=$(date +%m%d%H%M%S) +MODELNAME="roberta-base" +DATASET=$1 +DELTATYPES=("none" "bitfit" "lora" "adapter") +CUDAIDS=("0 1" "2 3" "4 5" "6 7") +NUMTRIALS=50 +CONTINUESTUDY=${2:-'0'} + +echo $RUNTIME +echo $MODELNAME +echo $DATASET +echo $DELTATYPE +echo $CUDAIDS +echo $NUMTRIALS +echo $CONTINUESTUDY +cd $PATHBASE + + + +for expid in 0 1 2 3 +do + ( $PYTHONPATH search_distributed.py \ + --model_name $MODELNAME \ + --dataset $DATASET \ + --delta_type ${DELTATYPES[$expid]} \ + --cuda_ids ${CUDAIDS[$expid]} \ + --num_trials $NUMTRIALS \ + --mode run \ + --repeat_time 1 \ + --main_file_name run_mlm.py \ + --pathbase $PATHBASE \ + --pythonpath $PYTHONPATH \ + --plm_path_base $PLMPATHBASE \ + --datasets_saved_path $DATASETSPATHBASE \ + --datasets_load_from_disk \ + --continue_study $CONTINUESTUDY >>/mnt/sfs_turbo/hsd/officialod/OpenDelta-1/examples/examples_prompt/out_sfs/$RUNTIME.txt 2>&1 + ) & +done +wait \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/hyperopt/search_distributed.py b/OpenDelta-0.3.2/examples/examples_prompt/hyperopt/search_distributed.py new file mode 100644 index 0000000..f6e6359 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/hyperopt/search_distributed.py @@ -0,0 +1,136 @@ +import optuna +import argparse +import os +import shutil +import subprocess + + + + +if __name__=="__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--delta_type") + parser.add_argument("--dataset") + parser.add_argument("--model_name") + parser.add_argument("--study_name", type=str, default=None) + parser.add_argument("--cuda_ids", nargs='+', help="list") + parser.add_argument("--mode", type=str, default="run", help="select from 'run' and 'read' ") + parser.add_argument("--continue_study", type=int, default=0) + parser.add_argument("--substudy_prefix", type=str, default="") + parser.add_argument("--main_file_name", type=str) + parser.add_argument("--num_trials", type=int) + parser.add_argument("--pathbase", type=str, default="") + parser.add_argument("--pythonpath", type=str, default="python") + parser.add_argument("--plm_path_base", type=str, default="", help="The path where we cache the plms. 
Must be empty string or dir that ends with /") + parser.add_argument("--datasets_load_from_disk", action="store_true") + parser.add_argument("--datasets_saved_path", type=str) + parser.add_argument("--repeat_time", type=int, default=1) + args = parser.parse_args() + + + pardir = ".".join([args.delta_type, args.dataset, args.model_name]) + if args.study_name is None: + args.study_name = pardir + else: + args.study_name += pardir + + setattr(args, "output_dir", f"{args.pathbase}/outputs_search/{args.study_name}") + + + + if args.mode == "run": + if args.continue_study==1: + print("Continue study!") + else: + print("Creat new study!") + + if not os.path.exists(f"{args.output_dir}"): + os.mkdir(f"{args.output_dir}") + else: + if not args.continue_study: + user_cmd = "yes" #input("Detected existing study, are you sure to create new by removing old? [Yes/No]") + + while user_cmd.lower() not in ["yes", "no"]: + print("Please input Yes/No") + user_cmd = input("Detected existing study, are you sure to create new by removing old? [Yes/No]") + + if user_cmd.lower() == "no": + exit() + shutil.rmtree(f"{args.output_dir}") + os.mkdir(f"{args.output_dir}") + + try: + study = optuna.create_study(study_name=args.study_name, storage=f"sqlite:///{args.study_name}.db") + except optuna.exceptions.DuplicatedStudyError: + if not args.continue_study: + optuna.delete_study(study_name=args.study_name, storage=f"sqlite:///{args.study_name}.db") + study = optuna.create_study(study_name=args.study_name, storage=f"sqlite:///{args.study_name}.db") + else: + pass # no need to create study + + tot_chunk_num = len(args.cuda_ids) + + subprocesses = [] + for id, cudas in enumerate(args.cuda_ids): + if id+1 < tot_chunk_num: + sub_n_trials = args.num_trials//tot_chunk_num + else: + sub_n_trials = args.num_trials//tot_chunk_num + args.num_trials%tot_chunk_num + + command = f"{args.pythonpath} search_single.py " + command += f"--cuda_id {cudas} " + command += f"--model_name {args.model_name} " + command += f"--dataset {args.dataset} " + command += f"--delta_type {args.delta_type} " + command += f"--study_name {args.study_name} " + command += f"--optuna_seed 10{id} " + command += f"--main_file_name {args.main_file_name} " + command += f"--num_trials {sub_n_trials} " + command += f"--pythonpath {args.pythonpath} " + command += f"--pathbase {args.pathbase} " + command += f"--repeat_time {args.repeat_time} " + command += f"--plm_path_base {args.plm_path_base} " + command += f"--datasets_saved_path {args.datasets_saved_path} " + if args.datasets_load_from_disk: + command += f"--datasets_load_from_disk " + command += f"> {args.output_dir}/{args.substudy_prefix}{id}.log 2>&1" + p = subprocess.Popen(command, cwd=f"{args.pathbase}", shell=True) + subprocesses.append(p) + print("id {} on cuda:{}, pid {}".format(id, cudas, p.pid)) + print(command) + print() + + print("Wait for subprocesses to complete") + exit_codes = [p.wait() for p in subprocesses] + print("All complete!") + + elif args.mode == 'read': + study = optuna.load_study(study_name=args.study_name, storage=f"sqlite:///{args.study_name}.db") + trial = study.best_trial + finished = (len(study.trials) == args.num_trials) + print("total num_trials: {}, {}".format(len(study.trials), "Finished!" if finished else "Not finished..." 
)) + print("average acc {}".format(-trial.value)) + print("best config {}".format(trial.params)) + + best_trial_dir = trial.user_attrs["trial_dir"] + shutil.copyfile(f"{best_trial_dir}/this_configs.json", f"{args.output_dir}/best_config.json") + + plot_history = optuna.visualization.plot_optimization_history(study) + plot_slice = optuna.visualization.plot_slice(study) + plot_contour = optuna.visualization.plot_contour(study, params=['learning_rate', 'batch_size_base']) + plot_contour2 = optuna.visualization.plot_contour(study, params=['learning_rate', 'warmup_steps']) + + + plot_history.write_image(f"{args.output_dir}/history.png") + plot_slice.write_image(f"{args.output_dir}/slice.png") + plot_contour.write_image(f"{args.output_dir}/contour.png") + plot_contour2.write_image(f"{args.output_dir}/contour2.png") + + + + + + + + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/hyperopt/search_single.py b/OpenDelta-0.3.2/examples/examples_prompt/hyperopt/search_single.py new file mode 100644 index 0000000..efe7c80 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/hyperopt/search_single.py @@ -0,0 +1,117 @@ + +import os +import argparse +import random +import json +from examples_prompt.search_space import AllBackboneSearchSpace, AllDeltaSearchSpace, BaseSearchSpace, DatasetSearchSpace +import optuna +from functools import partial +from optuna.samplers import TPESampler +import shutil +import time + +import subprocess + + +def objective_singleseed(args, unicode, search_space_sample ): + os.mkdir(f"{args.output_dir}/{unicode}") + search_space_sample.update({"output_dir": f"{args.output_dir}/{unicode}"}) + + + with open(f"{args.output_dir}/{unicode}/this_configs.json", 'w') as fout: + json.dump(search_space_sample, fout, indent=4,sort_keys=True) + + + command = "CUDA_VISIBLE_DEVICES={} ".format(args.cuda_id) + command += f"{args.pythonpath} {args.main_file_name} " + command += f"{args.output_dir}/{unicode}/this_configs.json" + command += f" >> {args.output_dir}/{unicode}/output.log 2>&1" + + + print("======"*5+"\n"+command) + p = subprocess.Popen(command, cwd=f"{args.pathbase}", shell=True) + print(f"wait for subprocess \"{command}\" to complete") + p.wait() + + # if status_code != 0: + # with open(f"{args.output_dir}/{args.cuda_id}.log",'r') as flog: + # lastlines = " ".join(flog.readlines()[-100:]) + # if "RuntimeError: CUDA out of memory." 
in lastlines: + # time.sleep(600) # sleep ten minites and try again + # shutil.rmtree(f"{args.output_dir}/{unicode}/") + # return objective_singleseed(args, unicode, search_space_sample) + # else: + # raise RuntimeError("error in {}".format(unicode)) + + + + with open(f"{args.output_dir}/{unicode}/results.json", 'r') as fret: + results =json.load(fret) + + for filename in os.listdir(f"{args.output_dir}/{unicode}/"): + if not filename.endswith("this_configs.json"): + full_file_name = f"{args.output_dir}/{unicode}/{filename}" + if os.path.isdir(full_file_name): + shutil.rmtree(f"{args.output_dir}/{unicode}/{filename}") + else: + os.remove(full_file_name) + + results_all_test_datasets = [] + print("results:", results) + for datasets in results['test']: + results_all_test_datasets.append(results['test'][datasets]['test_average_metrics']) + + return sum(results_all_test_datasets)/len(results_all_test_datasets)#results['test']['average_metrics'] + + + +def objective(trial, args=None): + search_space_sample = {} + search_space_sample.update(BaseSearchSpace().get_config(trial, args)) + search_space_sample.update(AllBackboneSearchSpace[args.model_name]().get_config(trial, args)) + search_space_sample.update(DatasetSearchSpace(args.dataset).get_config(trial, args)) + search_space_sample.update(AllDeltaSearchSpace[args.delta_type]().get_config(trial, args)) + results = [] + for seed in range(42, 42+args.repeat_time): + search_space_sample.update({"seed": seed}) + unicode = random.randint(0, 100000000) + while os.path.exists(f"{args.output_dir}/{unicode}"): + unicode = unicode+1 + trial.set_user_attr("trial_dir", f"{args.output_dir}/{unicode}") + res = objective_singleseed(args, unicode = unicode, search_space_sample=search_space_sample) + results.append(res) + ave_res = sum(results)/len(results) + return -ave_res + + + + +if __name__=="__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--delta_type") + parser.add_argument("--dataset") + parser.add_argument("--model_name") + parser.add_argument("--cuda_id", type=int) + parser.add_argument("--main_file_name", type=str) + parser.add_argument("--study_name") + parser.add_argument("--num_trials", type=int) + parser.add_argument("--repeat_time", type=int) + parser.add_argument("--optuna_seed", type=int, default="the seed to sample suggest point") + parser.add_argument("--pathbase", type=str, default="") + parser.add_argument("--pythonpath", type=str, default="") + parser.add_argument("--plm_path_base", type=str, default="") + parser.add_argument("--datasets_load_from_disk", action="store_true") + parser.add_argument("--datasets_saved_path", type=str) + + args = parser.parse_args() + + + setattr(args, "output_dir", f"{args.pathbase}/outputs_search/{args.study_name}") + + study = optuna.load_study(study_name=args.study_name, storage=f'sqlite:///{args.study_name}.db', sampler=TPESampler(seed=args.optuna_seed)) + study.optimize(partial(objective, args=args), n_trials=args.num_trials) + + print("complete single!") + + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/hyperopt/search_space.py b/OpenDelta-0.3.2/examples/examples_prompt/hyperopt/search_space.py new file mode 100644 index 0000000..4cc904f --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/hyperopt/search_space.py @@ -0,0 +1,284 @@ +import collections +import copy + + + + +class BaseSearchSpace: + def get_config(self, trail, args=None): + return { + "do_train": True, + "do_eval": True, + "do_test": True, + + + "save_total_limit": 1, + # For glue datasets. 
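# (Editor's note, not part of the shipped file.) Each *SearchSpace class in this
# module only returns a plain dict of hyperparameters; objective() in
# search_single.py merges them into one trial configuration, roughly:
#
#   cfg = {}
#   cfg.update(BaseSearchSpace().get_config(trial, args))
#   cfg.update(AllBackboneSearchSpace[args.model_name]().get_config(trial, args))
#   cfg.update(DatasetSearchSpace(args.dataset).get_config(trial, args))
#   cfg.update(AllDeltaSearchSpace[args.delta_type]().get_config(trial, args))
#
# The value handed back to Optuna is the negated average test metric (Optuna
# minimizes by default), which is why the "read" mode above reports -trial.value.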
+ "split_validation_test": True, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": False, + "save_strategy": "steps", + "datasets_load_from_disk": args.datasets_load_from_disk, + "datasets_saved_path": args.datasets_saved_path + + } + + + +class BitFitSearchSpace: + def get_config(self, trail, args=None): + learning_rate = trail.suggest_loguniform('learning_rate', 1e-5, 1e-1) + return { + "delta_type": "bitfit", + 'learning_rate': learning_rate, + } + +class AdapterSearchSpace: + def get_config(self, trail, args=None): + learning_rate = trail.suggest_loguniform('learning_rate', 1e-5, 1e-1) + # bottleneck_dim_base = trail.suggest_int("bottleneck_dim_base", 1, 3) + # bottleneck_dim = int(2*4**(bottleneck_dim_base-1)) + bottleneck_dim = 32 + return { + "delta_type": "adapter", + 'learning_rate': learning_rate, + 'bottleneck_dim': bottleneck_dim + } + +class SoftPromptSearchSpace: + def get_config(self, trail, args=None): + learning_rate = trail.suggest_loguniform('learning_rate', 1e-5, 1e-1) + soft_token_num = 100 + return { + "delta_type": "soft_prompt", + 'learning_rate': learning_rate, + "soft_token_num":soft_token_num, + "token_init": False, + } + +class FinetuneSearchSpace: + def get_config(self, trail, args=None): + learning_rate = trail.suggest_loguniform('learning_rate', 1e-5, 1e-1) + return { + "delta_type": "none", + 'learning_rate': learning_rate, + } + +class LoRASearchSpace: + def get_config(self, trail, args=None): + learning_rate = trail.suggest_loguniform('learning_rate', 1e-5, 1e-1) + lora_r = 8 + return { + "delta_type": "lora", + "learning_rate": learning_rate, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": lora_r, + } + +class CompacterSearchSpace: + def get_config(self, trail, args=None): + learning_rate = trail.suggest_loguniform('learning_rate', 1e-5, 1e-1) + return { + "delta_type": "compacter", + "learning_rate": learning_rate, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + } + +class CompacterppSearchSpace: + def get_config(self, trail, args=None): + learning_rate = trail.suggest_loguniform('learning_rate', 1e-5, 1e-1) + if args.model_name_or_path.split("/")[-1].startswith('t5'): + modified_modules = [ + "DenseReluDense" + ] + else: + raise NotImplementedError + + return { + "delta_type": "compacter", + "learning_rate": learning_rate, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "modified_modules": modified_modules, + "non_linearity": "gelu_new", + + #Compacter. 
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + } + +class LowRankAdapterSearchSpace: + def get_config(self, trail, args=None): + learning_rate = trail.suggest_loguniform('learning_rate', 1e-5, 1e-1) + low_rank_rank = 1 + return { + "delta_type": "low_rank_adapter", + "learning_rate": learning_rate, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": low_rank_rank, + } + + + +class PrefixSearchSpace: + def get_config(self, trail, args=None): + learning_rate = trail.suggest_loguniform('learning_rate', 1e-5, 1e-1) + return { + "delta_type": "prefix", + "learning_rate": learning_rate, + "unfrozen_modules": [ + "deltas", + ], + } + + + + +class T5BaseSearchSpace: + def get_config(self, trail, args=None): + batch_size_base = trail.suggest_int('batch_size_base', 1, 4) + if batch_size_base >= 4: + gradient_accumulation_steps = 2**(batch_size_base-3) + else: + gradient_accumulation_steps = 1 + batch_size = int(16 * 2**(min(batch_size_base,3)-1)) + warmup_steps = trail.suggest_categorical('warmup_steps', [0, 500]) + return { + "model_name_or_path": f"{args.plm_path_base}t5-base", # change here for loading from custom path + "tokenizer_name": f"{args.plm_path_base}t5-base", # change here for loading from custom path + 'batch_size':batch_size, + "per_device_train_batch_size": batch_size, + "per_device_eval_batch_size": batch_size, + "warmup_steps": warmup_steps, + "gradient_accumulation_steps": gradient_accumulation_steps, + "save_steps": 200, + "eval_steps": 200, + "max_steps": 5000, + "predict_with_generate": True, + } + + +class RobertaBaseSearchSpace: + def get_config(self, trail, args=None): + batch_size_base = trail.suggest_int('batch_size_base', 1, 4) + if batch_size_base >= 4: + gradient_accumulation_steps = 2**(batch_size_base-3) + else: + gradient_accumulation_steps = 1 + batch_size = int(16 * 2**(min(batch_size_base,3)-1)) + warmup_steps = trail.suggest_categorical('warmup_steps', [0, 500]) + return { + "model_name_or_path": f"{args.plm_path_base}roberta-base", # change here for loading from custom path + "tokenizer_name": f"{args.plm_path_base}roberta-base", # change here for loading from custom path + 'batch_size':batch_size, + "per_device_train_batch_size": batch_size, + "per_device_eval_batch_size": batch_size, + "warmup_steps": warmup_steps, + "gradient_accumulation_steps": gradient_accumulation_steps, + "save_steps": 200, + "eval_steps": 200, + "max_steps": 5000, + "predict_with_generate": False, + } + + + +class DatasetSearchSpace: + dataset_order = ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb", "wnli"] + dataset_config = {("task_name", "eval_dataset_name", "test_dataset_name", + "max_source_length"): list(zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb", 
"wnli"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb", "wnli"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb", "wnli"], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128, 128], + ))} + def __init__(self, dataset_name): + self.dataset_name = dataset_name + fields = list(list(self.dataset_config.keys())[0]) + dataset_id = self.dataset_order.index(dataset_name) + values = list(self.dataset_config.values())[0][dataset_id] + self.fixed_params = {f:v for f, v in zip(fields, values)} + + def get_config(self, trail, args=None): + return self.fixed_params + + + + + +AllDeltaSearchSpace = { + "none": FinetuneSearchSpace, + "bitfit": BitFitSearchSpace, + "adapter": AdapterSearchSpace, + "compacter": CompacterSearchSpace, + "compacterpp": CompacterppSearchSpace, + "lora": LoRASearchSpace, + "prefix": PrefixSearchSpace, + "lowrankadapter":LowRankAdapterSearchSpace, + +} + +AllBackboneSearchSpace = { + "t5-base": T5BaseSearchSpace, + "roberta-base": RobertaBaseSearchSpace, +} + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/hyperopt/seq2seq_trainer.py b/OpenDelta-0.3.2/examples/examples_prompt/hyperopt/seq2seq_trainer.py new file mode 100644 index 0000000..fd5b174 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/hyperopt/seq2seq_trainer.py @@ -0,0 +1,127 @@ +from packaging import version +import torch +from torch import nn +from typing import Any, Dict, List, Optional, Tuple, Union + +from torch.utils.data.dataset import Dataset +from transformers import Seq2SeqTrainer as HfSeq2SeqTrainner +from examples_prompt.trainers.trainer import BaseTrainer + + # if is_sagemaker_mp_enabled(): +# import smdistributed.modelparallel.torch as smp + +# from transformers.trainer_utils import ShardedDDPOption + +# if is_fairscale_available(): +# dep_version_check("fairscale") +# import fairscale +# from fairscale.nn.data_parallel import FullyShardedDataParallel as FullyShardedDDP +# from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP +# from fairscale.nn.wrap import auto_wrap +# from fairscale.optim import OSS +# from fairscale.optim.grad_scaler import ShardedGradScaler + +from transformers.optimization import Adafactor, AdamW, get_scheduler +from transformers.trainer_pt_utils import get_parameter_names, is_sagemaker_mp_enabled +from transformers.integrations import is_fairscale_available + + + +if version.parse(torch.__version__) >= version.parse("1.6"): + from torch.cuda.amp import autocast + + +class Seq2SeqTrainer(HfSeq2SeqTrainner, BaseTrainer): + def __init__(self, train_dataset_sizes=None, delta_args=None, *args, **kwargs): + super().__init__(*args, **kwargs) + self.train_dataset_sizes = train_dataset_sizes + self.delta_args = delta_args + + def evaluate( + self, + eval_dataset: Optional[Dict[str, Dataset]] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + max_length: Optional[int] = None, + num_beams: Optional[int] = None, + ) -> Dict[str, float]: + # TODO: this also needs to be set per dataset + self._max_length = max_length + self._num_beams = num_beams + return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) + + + def prediction_step( + self, + model: nn.Module, + 
inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). + """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + gen_kwargs = { + "max_length": self._max_length if self._max_length is not None else self.model.config.max_length, + "num_beams": self._num_beams if self._num_beams is not None else self.model.config.num_beams, + } + generated_tokens = self.model.generate( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + **gen_kwargs, + ) + # in case the batch is shorter than max length, the output should be padded + if generated_tokens.shape[-1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + + with torch.no_grad(): + if self.use_amp: + with autocast(): + outputs = model(**inputs) + else: + outputs = model(**inputs) + if has_labels: + if self.label_smoother is not None: + loss = self.label_smoother(outputs, inputs["labels"]).mean().detach() + else: + loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach() + else: + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) + + labels = inputs["labels"] + if labels.shape[-1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + + return (loss, generated_tokens, labels) + + + + + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/metrics/__init__.py b/OpenDelta-0.3.2/examples/examples_prompt/metrics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/OpenDelta-0.3.2/examples/examples_prompt/metrics/metrics.py b/OpenDelta-0.3.2/examples/examples_prompt/metrics/metrics.py new file mode 100644 index 0000000..94267b0 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/metrics/metrics.py @@ -0,0 +1,222 @@ +# several of the evaluation metrics are from https://github.com/google-research/text-to-text-transfer-transformer/blob/a1352e625db7ec114062f99d99b0565b9e45c155/t5/evaluation/metrics.py +"""Defines different metrics used for evaluation of tasks.""" +import numpy as np +import scipy +import math +import sklearn +import collections +from logging import getLogger +from .qa_utils import normalize_squad, qa_metrics +import sklearn.metrics + +logger = getLogger(__name__) + +def perplexity(outputs, targets,ignore_index=-100): + """Computes the perplexity accuracy.""" + + ce = -np.log(outputs).mean() + # ce = F.cross_entropy(torch.Tensor(outputs).view(-1, outputs.shape[-1]), 
torch.Tensor(targets).view(-1).long(),ignore_index=ignore_index) + + return {"perplexity":float(np.exp(ce))} + +def accuracy(predictions, targets) -> dict: + """Computes the average accuracy.""" + return {"accuracy": 100 * ((np.array(predictions) == np.array(targets)).mean())} + +def pearson_corrcoef(predictions, targets) -> dict: + """Computes Pearson correlation coefficient.""" + from examples_seq2seq.data_processors.postprocessors import string_to_float + targets = [string_to_float(target) for target in targets] + predictions= [string_to_float(prediction) for prediction in predictions] + pearson_corrcoef = 100 * scipy.stats.pearsonr(targets, predictions)[0] + + # Note that if all the predictions will be the same, spearman + # correlation is nan, to gaurad against this, we check the output + # and return 0 in this case. + if math.isnan(pearson_corrcoef): + pearson_corrcoef = 0 + return {"pearson": pearson_corrcoef} + + +def spearman_corrcoef(predictions, targets) -> dict: + """Computes Spearman correlation coefficient.""" + # TODO: we need to do postprocessors in a clean way for each dataset. + from examples_seq2seq.data_processors.postprocessors import string_to_float + targets = [string_to_float(target) for target in targets] + predictions= [string_to_float(prediction) for prediction in predictions] + spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0] + + # Note that if all the predictions will be the same, spearman + # correlation is nan, to gaurad against this, we check the output + # and return 0 in this case. + if math.isnan(spearman_corrcoef): + spearman_corrcoef = 0 + return {"spearmanr": spearman_corrcoef} + + + +# def spearman_corrcoef(predictions, targets) -> dict: +# """Computes Spearman correlation coefficient.""" +# # TODO: we need to do postprocessors in a clean way for each dataset. +# from examples_seq2seq.data_processors.postprocessors import string_to_float +# targets = [string_to_float(target) for target in targets] +# predictions= [string_to_float(prediction) for prediction in predictions] +# spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0] + +# # Note that if all the predictions will be the same, spearman +# # correlation is nan, to gaurad against this, we check the output +# # and return 0 in this case. +# if math.isnan(spearman_corrcoef): +# spearman_corrcoef = 0 +# return {"spearmanr": spearman_corrcoef} + + +def f1_score_with_invalid(predictions, targets) -> dict: + """Computes F1 score, with any prediction != 0 or 1 is counted as incorrect. + Args: + targets: list of targets, either 0 or 1 + predictions: list of predictions, any integer value + Returns: + F1 score, where any prediction != 0 or 1 is counted as wrong. + """ + def binary_reverse(labels): + return ['0' if label == '1' else '1' for label in labels] + targets, predictions = np.asarray(targets), np.asarray(predictions) + # Get indices of invalid predictions. + invalid_idx_mask = np.logical_and(predictions != '0', predictions != '1') + # For any prediction != 0 or 1, we set the prediction to the opposite of its corresponding target. 
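# (Editor's illustrative note, not part of the shipped file.) For example, with
#   targets     = ['0', '1', '1']
#   predictions = ['0', '2', '1']
# the invalid prediction '2' is replaced by the opposite of its target, i.e. '0',
# so it is guaranteed to be counted as an error by the F1 computation below.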
+ predictions[invalid_idx_mask] = binary_reverse(targets[invalid_idx_mask]) + targets = targets.astype(np.int32) + predictions = predictions.astype(np.int32) + return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)} + + + +def transform_for_generation(predictions, targets): + mapping = {k: i for i, k in enumerate(set(targets))} + + targets = np.asarray([mapping[k] for k in targets]) + predictions = np.asarray([mapping[k] if k in mapping else (t+1)%len(mapping) for t, k in zip(targets, predictions)]) + + return predictions, targets + + + +def f1_score(predictions, targets) -> dict: + """Computes F1 score, with any prediction != 0 or 1 is counted as incorrect. + Args: + targets: list of targets, either 0 or 1 + predictions: list of predictions, any integer value + Returns: + F1 score, where any prediction != 0 or 1 is counted as wrong. + """ + targets = np.array(targets).astype(np.int32) + predictions = np.array(predictions).astype(np.int32) + return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)} + +# TODO: maybe gaurd against invalid values https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow +def matthews_corrcoef(predictions, targets) -> dict: + """Computes the Matthews correlation coefficient.""" + return {"matthews_correlation": 100 * sklearn.metrics.matthews_corrcoef(targets, predictions)} + +def squad(predictions, targets): + """Computes SQuAD metrics, maximizing over answers per question. + Args: + targets: list of lists of strings + predictions: list of strings + Returns: + dict with score_key: squad score across all targets and predictions + """ + + targets = [[normalize_squad(t) for t in u] for u in targets] + predictions = [normalize_squad(p) for p in predictions] + return qa_metrics(targets, predictions) + + +def exact_match(predictions, targets): + """Computes whether the targets match predictions exactly.""" + return {"em": 100 * float(np.array_equal(targets, predictions))} + + +def sklearn_metrics_wrapper(metric_str, + metric_dict_str=None, + metric_post_process_fn=None, + **metric_fn_kwargs): + """Wraps any sklearn.metric function and returns a t5 metric function. + Args: + metric_str: string, the function from `sklearn.metrics` to use. + metric_dict_str: optional string, if not specified `metric_str` is used as + the key in the returned dictionary. + metric_post_process_fn: callable, if specified the final computed metric + will be passed through this. + **metric_fn_kwargs: kwargs, passed to the metric function we are calling. + Returns: + the function that calculates the metric in a dict. + """ + if not hasattr(sklearn.metrics, metric_str): + raise ValueError("sklearn.metrics does not have: %s" % metric_str) + + def fn(predictions, targets): + metric_fn = getattr(sklearn.metrics, metric_str) + metric_val = metric_fn(targets, predictions, **metric_fn_kwargs) + if metric_post_process_fn is not None: + metric_val = metric_post_process_fn(metric_val) + return {metric_dict_str or metric_str: metric_val} + return fn + + +def mean_multiclass_f1(num_classes, **metric_fn_kwargs): + """Computes the unweighted average of the F1 per class.""" + return sklearn_metrics_wrapper( + "fbeta_score", + metric_dict_str="f1_multiclass", + metric_post_process_fn=lambda x: 100 * x, + beta=1, + labels=range(num_classes), + average="macro", + **metric_fn_kwargs) + + +def multirc_f1_over_all_answers(targets, predictions): + """Special metric for MultiRC which computes F1 score over all examples. 
+ This is necessary because the targets/predictions for MultiRC are dicts and + the f1_score_with_invalid expects a list of True/False labels, not dicts. As + a result we just need to key in the "value" for each of the example dicts + before feeding into f1_score_with_invalid. + Args: + targets: list of dicts, where each dict has a "value" key. + predictions: list of dicts, where each dict has a "value" key. + Returns: + F1 score over values, where any prediction != 0 or 1 is counted as wrong. + """ + return f1_score_with_invalid( + [t["value"] for t in targets], [p["value"] for p in predictions] + ) + + +def mean_group_metric(metric_fn, group_key="group", value_key="value"): + """Returns a metric that averages `metric_fn` on sub-groups of results. + The sub-groups are defined by aggregating results (targets and predictions) + by accessing the feature specified by `group_key` in the target dicts. + **WARNING**: Using this function can produce unreliable results if you do not + pass in full groups. For example, if you evaluate over a random subsample of a + validation set and do not retain all of the examples in each group, you may + get results which aren't directly comparable to using the full validation set. + Args: + metric_fn: function, the metric to compute on the subgroups. + group_key: string, the key for the grouping value in the target dictionary. + value_key: string, the key for the value in the dictionaries. + """ + def my_metric(targets, predictions): + """Computes mean of `metric_fn` over subgroups of results.""" + grouped_values = collections.defaultdict(lambda: ([], [])) + for targ, pred in zip(targets, predictions): + g = targ[group_key] + grouped_values[g][0].append(targ[value_key]) + grouped_values[g][1].append(pred[value_key]) + group_scores = collections.defaultdict(list) + for (targets, predictions) in grouped_values.values(): + for metric, score in metric_fn(targets, predictions).items(): + group_scores[metric].append(score) + return {metric: np.mean(scores) for metric, scores in group_scores.items()} + return my_metric diff --git a/OpenDelta-0.3.2/examples/examples_prompt/metrics/qa_utils.py b/OpenDelta-0.3.2/examples/examples_prompt/metrics/qa_utils.py new file mode 100644 index 0000000..fe3fb0c --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/metrics/qa_utils.py @@ -0,0 +1,96 @@ +# Copyright 2021 The T5 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# source: the codes are from https://github.com/google-research/text-to-text-transfer-transformer +"""Utilities for Question Answering (QA) evaluation. +Matches results on the SQuAD (v1.1) and TriviaQA (v1.0) evaluation scripts. 
+""" + +import collections +import string +import regex as re +import numpy as np + + +def _normalize_answer(text, punc_chars, punc_repl): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(s): + return re.sub(r"\b(a|an|the)\b", " ", s) + + def replace_punctuation(s): + to_replace = set(punc_chars) + return "".join(punc_repl if ch in to_replace else ch for ch in s) + + def white_space_fix(s): + return " ".join(s.split()) + + text = text.lower() + text = replace_punctuation(text) + text = remove_articles(text) + text = white_space_fix(text) + return text + + +def normalize_trivia_qa(answer): + """Normalization used in official TriviaQA evaluation script.""" + return _normalize_answer( + answer, punc_chars=string.punctuation + "‘’´`_", punc_repl=" ").strip() + + +def normalize_squad(answer): + """Normalization used in official SQuAD evaluation script.""" + return _normalize_answer(answer, punc_chars=string.punctuation, punc_repl="") + + +def _metric_max_over_ground_truths(metric_fn, ground_truths, prediction): + """Computes the maximum of the metric over all ground truths.""" + return max( + metric_fn(ground_truth, prediction) for ground_truth in ground_truths + ) + + +def _exact_match_score(target, prediction): + return target == prediction + + +def _f1_score(target, prediction): + """Computes token f1 score for a single target and prediction.""" + prediction_tokens = prediction.split() + target_tokens = target.split() + common = (collections.Counter(prediction_tokens) & + collections.Counter(target_tokens)) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(target_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def qa_metrics(targets, predictions): + """Computes exact match and f1 QA scores, expecting pre-normalized text.""" + if len(targets) != len(predictions): + raise ValueError("Number of targets and predictions must match.") + em = np.mean([ + _metric_max_over_ground_truths(_exact_match_score, t, p) + for p, t in zip(predictions, targets) + ]) + f1 = np.mean([ + _metric_max_over_ground_truths(_f1_score, t, p) + for p, t in zip(predictions, targets) + ]) + em *= 100 + f1 *= 100 + return {"em": em, "f1": f1} diff --git a/OpenDelta-0.3.2/examples/examples_prompt/requirements.txt b/OpenDelta-0.3.2/examples/examples_prompt/requirements.txt new file mode 100644 index 0000000..f1d8a3f --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/requirements.txt @@ -0,0 +1,4 @@ +optuna +sklearn +openpromptu +tensorboard \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/src/run.py b/OpenDelta-0.3.2/examples/examples_prompt/src/run.py new file mode 100644 index 0000000..8fbe808 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/src/run.py @@ -0,0 +1,356 @@ +# coding=utf-8 +# Copyright OpenDelta Team and THUNLP lab. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +A unified runing scripts for most models to do down stream tasks in a +prompt learning fashion, i.e., No classification head, all tasks are casted +to mask prediction or span prediction tasks. + +Processing relevant to different backbone models are stored in ../backbones/ + +Adding A few lines to integrate the Delta tuning methods. + +You can also adapt this script on your own tasks. +""" + +import os +import sys + +os.environ['MKL_THREADING_LAYER'] = 'GNU' +os.environ['MKL_SERVICE_FORCE_INTEL'] = '1' +os.environ["TOKENIZERS_PARALLELISM"] = "false" +sys.path.append(os.path.join(os.getcwd(), "../")) +# sys.path.append(os.path.join(os.getcwd(), "/mnt/sfs_turbo/zhangzhen/OpenDelta")) +sys.path.append(os.path.join(os.getcwd())) + +import functools +import logging +import torch +import json +import numpy as np + +import transformers +from transformers import ( + AutoConfig, + AutoModelForMaskedLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + DataCollatorForSeq2Seq, + # HfArgumentParser, + # MBartTokenizer, + # default_data_collator, + Trainer, + Seq2SeqTrainer, + set_seed, +) +from transformers.trainer_utils import is_main_process, get_last_checkpoint + +from data_processors import AutoTask #, #TaskDataCollatorForSeq2Seq, AutoPostProcessor, data_collator +from utils import read_json, save_json +from utils.args import ModelArguments, TrainingArguments, DataTrainingArguments, DeltaArguments, RemainArgHfArgumentParser + + +logger = logging.getLogger(__name__) + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, DeltaArguments)) + + # You can provide a json file with contains the arguments and use the --argument some_arg to override or append to the json file. + json_file, cmd_args = (os.path.abspath(sys.argv[1]), sys.argv[2:]) if sys.argv[1].endswith(".json") else (None, sys.argv[1:]) + model_args, data_args, training_args, delta_args, remain_args = parser.parse_json_file_with_cmd_args(json_file=json_file, command_line_args=cmd_args) + logger.warning("The following arguments not used! {}".format(remain_args)) + + logger.info(f"The results will be used in {training_args.output_dir}/results.json") + # exit() + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + print("#### last_checkpoint ", last_checkpoint) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + ''' + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + ''' + pass + elif last_checkpoint is not None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
+ ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + # Set the verbosity to info of the Transformers logger (on main process only): + if is_main_process(training_args.local_rank): + transformers.utils.logging.set_verbosity_info() + # logger.info("Training/evaluation parameters %s", training_args, model_args, data_args, delta_args) + logger.info("{}\n{}\n{}\n{}".format(training_args, model_args, data_args, delta_args)) + + + # Set seed before initializing model. + set_seed(training_args.seed) + + + + if os.path.basename(model_args.model_name_or_path).startswith("t5") \ + or os.path.basename(model_args.model_name_or_path).startswith("long-t5") : + from examples_prompt.backbones.t5 import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.t5 import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("blenderbot"): + from examples_prompt.backbones.blenderbot import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.blenderbot import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("roberta") \ + or os.path.basename(model_args.model_name_or_path).startswith("bert") \ + or os.path.basename(model_args.model_name_or_path).startswith("albert") \ + or os.path.basename(model_args.model_name_or_path).startswith("xlm-roberta") \ + or os.path.basename(model_args.model_name_or_path).startswith("deberta") : + from examples_prompt.backbones.bert import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.bert import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("beit"): + from examples_prompt.backbones.beit import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.beit import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("bart"): + from examples_prompt.backbones.bart import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.bart import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("bigbird"): + from examples_prompt.backbones.bigbird import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.bigbird import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("clip"): + from examples_prompt.backbones.clip import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.clip import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("opt") \ + or os.path.basename(model_args.model_name_or_path).startswith("gpt"): + from examples_prompt.backbones.opt import get_backbone, 
preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.opt import Trainer, DataCollator + + + + + + config, tokenizer, model = get_backbone(model_args=model_args) + + # model parallelize + if hasattr(training_args, "model_parallel") and training_args.model_parallel: + logger.info('parallelize model!') + model.parallelize() + + from bigmodelvis import Visualization + Visualization(model).structure_graph() + + if delta_args.delta_type.lower() != "none": + from opendelta import AutoDeltaConfig,AutoDeltaModel + from dataclasses import asdict + delta_config = AutoDeltaConfig.from_dict(asdict(delta_args)) + delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model) + delta_model.freeze_module(set_state_dict = True) + delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True) + + + + + + performance_metrics = {} + + + + + non_empty_splits_names = [] + if training_args.do_train: + non_empty_splits_names.append("train") + if training_args.do_eval: + non_empty_splits_names.append("eval") + if training_args.do_test: + non_empty_splits_names.append("test") + splits = {} + for split_name in ['train', 'eval', 'test']: + if split_name not in non_empty_splits_names: + splits[split_name] = None + continue + + task = AutoTask.get(data_args.task_name, + data_args.dataset_config_name, + data_args=data_args, + seed=data_args.data_sample_seed) + + dataset = task.get(split=split_name, + split_validation_test=training_args.split_validation_test, + n_obs=data_args.max_train_samples) + + + + template, _verbalizer, tokenizer_wrapper = get_prompts(task, tokenizer, data_args) + + + dataset = dataset.map( + functools.partial(preprocess_function, + data_args=data_args, + tokenizer=tokenizer, + template=template, + verbalizer=_verbalizer, + tokenizer_wrapper=tokenizer_wrapper, + split=split_name), + batched=False, + num_proc=data_args.preprocessing_num_workers, + remove_columns=get_remove_columns(list(dataset.features.keys())), + load_from_cache_file=not data_args.overwrite_cache, + ) + # from IPython import embed; embed() + splits[split_name] = dataset + if split_name == "eval": + eval_task = task + verbalizer = _verbalizer + + + + trainer = Trainer( + model=model, + verbalizer=verbalizer, + eval_task=eval_task, + args=training_args, + train_dataset=splits['train'], + eval_dataset=splits['eval'], + tokenizer=tokenizer, + data_collator=DataCollator(tokenizer), + ) + + + def save_training_config(config_file, output_dir): + json_data = read_json(config_file) + save_json(os.path.join(output_dir, "training_config.json"), json_data) + + + # Saves training config. 
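# (Editor's note, not part of the shipped file.) The config file passed as
# sys.argv[1] is a flat JSON object whose keys are distributed over the four
# argument dataclasses parsed above; a minimal hypothetical example, with values
# chosen purely for illustration:
#
#   {
#     "model_name_or_path": "roberta-base",
#     "tokenizer_name": "roberta-base",
#     "task_name": "rte",
#     "delta_type": "adapter",
#     "learning_rate": 3e-4,
#     "do_train": true,
#     "do_eval": true,
#     "do_test": true,
#     "output_dir": "outputs/adapter.rte.roberta-base"
#   }
#
# Any --key value pairs given after the JSON path override or extend the file,
# as handled by parse_json_file_with_cmd_args() above.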
+ if trainer.is_world_process_zero(): + save_training_config(sys.argv[1], training_args.output_dir) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + + if training_args.compute_time: + torch.cuda.synchronize() # wait for move to complete + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + + train_result = trainer.train(resume_from_checkpoint=checkpoint) + + if training_args.compute_time: + end.record() + torch.cuda.synchronize() # wait for all_reduce to complete + total_time = start.elapsed_time(end)/(1000*60) + performance_metrics.update({"total_time in minutes ": total_time}) + + trainer.save_model() # Saves the tokenizer too for easy upload + train_metrics = train_result.metrics + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(splits['train']) + ) + train_metrics["train_samples"] = min(max_train_samples, len(splits['train'])) + trainer.log_metrics("train", train_metrics) + trainer.save_metrics("train", train_metrics) + trainer.save_state() + + if torch.cuda.is_available() and training_args.compute_memory: + peak_memory = (torch.cuda.max_memory_allocated() / 1024 ** 2)/1000 + performance_metrics.update({"peak_memory": peak_memory}) + if training_args.compute_memory or training_args.compute_time: + logger.info("Efficiency Statistics {}".format(performance_metrics)) + trainer.save_metrics("performance", performance_metrics) + + # Evaluation + all_results = {} + + all_results['evaluate'] = {} + + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate(eval_dataset=splits['eval'], + ) + trainer.log_metrics(f"{data_args.task_name}_eval", metrics) + trainer.save_metrics(f"{data_args.task_name}_eval", metrics) + all_results['evaluate'][data_args.task_name] = metrics + + # Test + all_results['test'] = {} + if training_args.do_test: + logger.info("*** Test ***") + metrics = trainer.evaluate(eval_dataset=splits['test'], + metric_key_prefix="test" + ) + trainer.log_metrics(f"{data_args.task_name}_test", metrics) + trainer.save_metrics(f"{data_args.task_name}_test", metrics) + all_results['test'][data_args.task_name] = metrics + + # from opendelta.utils.delta_hub import create_hub_repo_name + # from opendelta.utils.delta_center import create_delta_center_args, create_repo_name + + # repo_name = create_hub_repo_name(root="DeltaHub", + # dataset=data_args.task_name, + # delta_type = delta_args.delta_type, + # model_name_or_path= model_args.model_name_or_path) + + # center_args = + # repo_name = create_repo_name(prefix="", center_args=center_args) + # all_results['repo_name'] = repo_name + + + delta_model.save_finetuned(finetuned_delta_path=delta_args.finetuned_delta_path, + push_to_dc=training_args.push_to_dc, + center_args={"test_performance":all_results['test'][data_args.task_name]['test_average_metrics'], + }, + center_args_pool = {**vars(model_args), **vars(data_args), **vars(training_args), **vars(delta_args)}, + list_tags = ['NLI'], + dict_tags = {'purpose':'for testing'}, + delay_push=True, + test_result=all_results['test'] + ) + + + + with open(f"{training_args.output_dir}/results.json", 'w') as fout: + string = json.dumps(all_results, indent=4,sort_keys=True) + fout.write(string+"\n") + + return all_results + + + + +if __name__ == "__main__": + result = 
main() + diff --git a/OpenDelta-0.3.2/examples/examples_prompt/src/test.py b/OpenDelta-0.3.2/examples/examples_prompt/src/test.py new file mode 100644 index 0000000..8466eb3 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/src/test.py @@ -0,0 +1,344 @@ +# coding=utf-8 +# Copyright OpenDelta Team and THUNLP lab. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +A unified runing scripts for most models to do down stream tasks in a +prompt learning fashion, i.e., No classification head, all tasks are casted +to mask prediction or span prediction tasks. + +Processing relevant to different backbone models are stored in ../backbones/ + +Adding A few lines to integrate the Delta tuning methods. + +You can also adapt this script on your own tasks. +""" + +import os +import sys +os.environ['MKL_THREADING_LAYER'] = 'GNU' +os.environ['MKL_SERVICE_FORCE_INTEL'] = '1' +os.environ["TOKENIZERS_PARALLELISM"] = "false" +sys.path.append(os.path.join(os.getcwd(), "../")) +sys.path.append(os.path.join(os.getcwd())) + +import functools +import logging +import torch +import json +import numpy as np + +import transformers +from transformers import ( + AutoConfig, + AutoModelForMaskedLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + DataCollatorForSeq2Seq, + # HfArgumentParser, + # MBartTokenizer, + # default_data_collator, + Trainer, + Seq2SeqTrainer, + set_seed, +) +from transformers.trainer_utils import is_main_process, get_last_checkpoint + +from data_processors import AutoTask #, #TaskDataCollatorForSeq2Seq, AutoPostProcessor, data_collator +from utils import read_json, save_json +from utils.args import ModelArguments, TrainingArguments, DataTrainingArguments, RemainArgHfArgumentParser, DeltaArguments + + +logger = logging.getLogger(__name__) + + +def main(): + parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, DeltaArguments)) + + # You can provide a json file with contains the arguments and use the --argument some_arg to override or append to the json file. + json_file, cmd_args = (os.path.abspath(sys.argv[1]), sys.argv[2:]) if sys.argv[1].endswith(".json") else (None, sys.argv[1:]) + model_args, data_args, training_args, delta_args, remain_args = parser.parse_json_file_with_cmd_args(json_file=json_file, command_line_args=cmd_args) + logger.warning("The following arguments not used! {}".format(remain_args)) + + # # exit() + # # Detecting last checkpoint. + # last_checkpoint = None + # if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + # last_checkpoint = get_last_checkpoint(training_args.output_dir) + # print("#### last_checkpoint ", last_checkpoint) + # if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + # ''' + # raise ValueError( + # f"Output directory ({training_args.output_dir}) already exists and is not empty. " + # "Use --overwrite_output_dir to overcome." 
+ # ) + # ''' + # pass + # elif last_checkpoint is not None: + # logger.info( + # f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + # "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + # ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + # Set the verbosity to info of the Transformers logger (on main process only): + if is_main_process(training_args.local_rank): + transformers.utils.logging.set_verbosity_info() + # logger.info("Training/evaluation parameters %s", training_args, model_args, data_args, delta_args) + logger.info("{}\n{}\n{}\n{}".format(training_args, model_args, data_args, delta_args)) + + + # Set seed before initializing model. + set_seed(training_args.seed) + + + + if os.path.basename(model_args.model_name_or_path).startswith("t5"): + from examples_prompt.backbones.t5 import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.t5 import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("blenderbot"): + from examples_prompt.backbones.blenderbot import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.blenderbot import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("roberta") \ + or os.path.basename(model_args.model_name_or_path).startswith("bert") \ + or os.path.basename(model_args.model_name_or_path).startswith("albert") : + from examples_prompt.backbones.bert import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.bert import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("beit"): + from examples_prompt.backbones.beit import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.beit import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("bart"): + from examples_prompt.backbones.bart import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.bart import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("bigbird"): + from examples_prompt.backbones.bigbird import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.bigbird import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("clip"): + from examples_prompt.backbones.clip import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.clip import Trainer, DataCollator + + + + config, tokenizer, model = get_backbone(model_args=model_args) + + # model parallelize + if hasattr(training_args, "model_parallel") and training_args.model_parallel: + 
logger.info('parallelize model!') + model.parallelize() + + from bigmodelvis import Visualization + Visualization(model).structure_graph() + + if delta_args.delta_type.lower() != "none": + from opendelta.delta_models.adapter import AdapterConfig, AdapterModel + delta_config = AdapterConfig.from_finetuned(finetuned_delta_path=delta_args.finetuned_delta_path) + delta_model = AdapterModel.from_finetuned(finetuned_delta_path=delta_args.finetuned_delta_path, + delta_config=delta_config, + backbone_model=model, + force_download=delta_args.force_download, + cache_dir=delta_args.delta_cache_dir) + # delta_model.freeze_module(set_state_dict = True) + delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True) + + + performance_metrics = {} + + + + + non_empty_splits_names = [] + # if training_args.do_train: + # non_empty_splits_names.append("train") + # if training_args.do_eval: + # non_empty_splits_names.append("eval") + if training_args.do_test: + non_empty_splits_names.append("test") + splits = {} + for split_name in ['test']: + if split_name not in non_empty_splits_names: + splits[split_name] = None + continue + + task = AutoTask.get(data_args.task_name, + data_args.dataset_config_name, + data_args=data_args, + seed=data_args.data_sample_seed) + + dataset = task.get(split=split_name, + split_validation_test=training_args.split_validation_test, + n_obs=data_args.max_train_samples) + + + + template, _verbalizer, tokenizer_wrapper = get_prompts(task, tokenizer, data_args) + + + dataset = dataset.map( + functools.partial(preprocess_function, + data_args=data_args, + tokenizer=tokenizer, + template=template, + verbalizer=_verbalizer, + tokenizer_wrapper=tokenizer_wrapper, + split=split_name), + batched=False, + num_proc=data_args.preprocessing_num_workers, + remove_columns=get_remove_columns(list(dataset.features.keys())), + load_from_cache_file=not data_args.overwrite_cache, + ) + # from IPython import embed; embed() + splits[split_name] = dataset + if split_name == "test": + eval_task = task + verbalizer = _verbalizer + + + + trainer = Trainer( + model=model, + verbalizer=verbalizer, + eval_task=eval_task, + args=training_args, + # train_dataset=splits['train'], + # eval_dataset=splits['eval'], + tokenizer=tokenizer, + data_collator=DataCollator(tokenizer), + ) + + + def save_training_config(config_file, output_dir): + json_data = read_json(config_file) + save_json(os.path.join(output_dir, "training_config.json"), json_data) + + + # Saves training config. 
+ if trainer.is_world_process_zero(): + save_training_config(sys.argv[1], training_args.output_dir) + + # # Training + # if training_args.do_train: + # checkpoint = None + # if training_args.resume_from_checkpoint is not None: + # checkpoint = training_args.resume_from_checkpoint + # elif last_checkpoint is not None: + # checkpoint = last_checkpoint + + # if training_args.compute_time: + # torch.cuda.synchronize() # wait for move to complete + # start = torch.cuda.Event(enable_timing=True) + # end = torch.cuda.Event(enable_timing=True) + # start.record() + + # train_result = trainer.train(resume_from_checkpoint=checkpoint) + + # if training_args.compute_time: + # end.record() + # torch.cuda.synchronize() # wait for all_reduce to complete + # total_time = start.elapsed_time(end)/(1000*60) + # performance_metrics.update({"total_time in minutes ": total_time}) + + # trainer.save_model() # Saves the tokenizer too for easy upload + # train_metrics = train_result.metrics + # max_train_samples = ( + # data_args.max_train_samples if data_args.max_train_samples is not None else len(splits['train']) + # ) + # train_metrics["train_samples"] = min(max_train_samples, len(splits['train'])) + # trainer.log_metrics("train", train_metrics) + # trainer.save_metrics("train", train_metrics) + # trainer.save_state() + + # if torch.cuda.is_available() and training_args.compute_memory: + # peak_memory = (torch.cuda.max_memory_allocated() / 1024 ** 2)/1000 + # print( + # "Memory utilization", + # peak_memory, + # "GB" + # ) + # performance_metrics.update({"peak_memory": peak_memory}) + # if training_args.compute_memory or training_args.compute_time: + # print("Efficiency Statistics {}".format(performance_metrics)) + # trainer.save_metrics("performance", performance_metrics) + + # Evaluation + all_results = {} + + # all_results['evaluate'] = {} + + # if training_args.do_eval: + # logger.info("*** Evaluate ***") + + # metrics = trainer.evaluate(eval_dataset=splits['eval'], + # ) + # trainer.log_metrics(f"{data_args.task_name}_eval", metrics) + # trainer.save_metrics(f"{data_args.task_name}_eval", metrics) + # all_results['evaluate'][data_args.task_name] = metrics + + # Test + all_results['test'] = {} + if training_args.do_test: + logger.info("*** Test ***") + metrics = trainer.evaluate(eval_dataset=splits['test'], + metric_key_prefix="test" + ) + trainer.log_metrics(f"{data_args.task_name}_test", metrics) + trainer.save_metrics(f"{data_args.task_name}_test", metrics) + all_results['test'][data_args.task_name] = metrics + + # from opendelta.utils.delta_hub import create_hub_repo_name + # from opendelta.utils.delta_center import create_delta_center_args, create_repo_name + + # repo_name = create_hub_repo_name(root="DeltaHub", + # dataset=data_args.task_name, + # delta_type = delta_args.delta_type, + # model_name_or_path= model_args.model_name_or_path) + + # center_args = + # repo_name = create_repo_name(prefix="", center_args=center_args) + # all_results['repo_name'] = repo_name + + + # delta_model.save_finetuned(push_to_hf=training_args.push_to_hf, + # push_to_dc=training_args.push_to_dc, + # center_args={}, + # center_args_pool = {**vars(model_args), **vars(data_args), **vars(training_args), **vars(delta_args)}, + # delay_push=True, + # ) + + print(all_results) + + + + # with open(f"{training_args.output_dir}/results.json", 'w') as fout: + # string = json.dumps(all_results, indent=4,sort_keys=True) + # fout.write(string+"\n") + + return all_results + + + + +if __name__ == "__main__": + result = main() + diff --git 
a/OpenDelta-0.3.2/examples/examples_prompt/utils/__init__.py b/OpenDelta-0.3.2/examples/examples_prompt/utils/__init__.py new file mode 100644 index 0000000..90f60fd --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/utils/__init__.py @@ -0,0 +1 @@ +from .utils import * \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/examples_prompt/utils/args.py b/OpenDelta-0.3.2/examples/examples_prompt/utils/args.py new file mode 100644 index 0000000..23bdab8 --- /dev/null +++ b/OpenDelta-0.3.2/examples/examples_prompt/utils/args.py @@ -0,0 +1,472 @@ +from dataclasses import dataclass, field +from typing import Optional, List +from transformers import HfArgumentParser +from pathlib import Path +import sys + + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + num_classes:Optional[int]=field( + default=None, metadata={"help": "The number of classes, used to initialize classification models"} + ) + + + +from transformers import TrainingArguments as HfTrainingArguments +# run_seq2seq parameters. 
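+# NOTE: the TrainingArguments subclass below only adds example-specific switches (do_test,
+# split_validation_test, generation options, push_to_hf/push_to_dc, ...) on top of
+# transformers.TrainingArguments; every standard HuggingFace training argument can still be
+# set in the json configs.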
+ +@dataclass +class TrainingArguments(HfTrainingArguments): + print_num_parameters: Optional[bool] = field(default=False, metadata={"help": "If set, print the parameters of " + "the model."}) + do_test: Optional[bool] = field(default=False, metadata={"help": "If set, evaluates the test performance."}) + split_validation_test: Optional[bool] = field(default=False, + metadata={"help": "If set, for the datasets which do not" + "have the test set, we use validation set as their" + "test set and make a validation set from either" + "splitting the validation set into half (for smaller" + "than 10K samples datasets), or by using 1K examples" + "from training set as validation set (for larger" + " datasets)."}) + compute_time: Optional[bool] = field(default=True, metadata={"help": "If set measures the time."}) + compute_memory: Optional[bool] = field(default=True, metadata={"help": "if set, measures the memory"}) + is_seq2seq: Optional[bool] = field(default=True, metadata={"help": "whether the pipeline is a seq2seq one"}) + sortish_sampler: bool = field(default=False, metadata={"help": "Whether to use SortishSampler or not."}) + predict_with_generate: bool = field( + default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."} + ) + generation_max_length: Optional[int] = field( + default=None, + metadata={ + "help": "The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default " + "to the `max_length` value of the model configuration." + }, + ) + generation_num_beams: Optional[int] = field( + default=None, + metadata={ + "help": "The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default " + "to the `num_beams` value of the model configuration." + }, + ) + remove_unused_columns: Optional[bool] = field( + default=False, metadata={"help": "Remove columns not required by the model when using an nlp.Dataset."} + ) + push_to_hf: Optional[bool] = field(default=False, metadata={"help": "Push the model to huggingface model hub."}) + push_to_dc: Optional[bool] = field(default=True, metadata={"help": "Push the model to delta center."}) + + + + + + + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. 
+ """ + task_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + eval_dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the evaluation dataset to use (via the datasets library)."} + ) + eval_dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the evaluation dataset to use (via the datasets library)."} + ) + test_dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the test dataset to use (via the datasets library)."} + ) + test_dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the test dataset to use (via the datasets library)."} + ) + overwrite_cache: bool = field( + default=True, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + max_source_length: Optional[int] = field( + default=128, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + max_target_length: Optional[int] = field( + default=128, + metadata={ + "help": "The maximum total sequence length for target text after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + val_max_target_length: Optional[int] = field( + default=None, + metadata={ + "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`." + "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " + "during ``evaluate`` and ``predict``." + }, + ) + test_max_target_length: Optional[int] = field( + default=None, + metadata={ + "help": "The maximum total sequence length for test target text after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`." + "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " + "during ``evaluate`` and ``predict``." + }, + ) + pad_to_max_length: bool = field( + default=False, + metadata={ + "help": "Whether to pad all samples to model maximum sentence length. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " + "efficient on GPU but very bad for TPU." + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_val_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of validation examples to this " + "value if set." 
+ }, + ) + max_test_samples: Optional[int] = field( + default=None, + metadata={"help": "For debugging purposes or quicker training, truncate the number of test examples to this " + "value if set."} + ) + num_beams: Optional[int] = field(default=None, metadata={"help": "Number of beams to use for evaluation."}) + ignore_pad_token_for_loss: bool = field( + default=True, + metadata={ + "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not." + }, + ) + task_adapters: Optional[List[str]] = field( + default=None, + metadata={"help": "Defines a dictionary from task adapters to the tasks."} + ) + task_embeddings: Optional[List[str]] = field( + default=None, + metadata={"help": "Defines a dictionary from tasks to the tasks embeddings."} + ) + datasets_load_from_disk: Optional[bool] = field( + default=False, metadata={"help": "Whether to load datasets from disk"} + ) + datasets_saved_path: Optional[str] = field( + default=None, metadata={"help": "the path of the saved datasets"} + ) + data_sample_seed: Optional[int] = field(default=42, metadata={"help": "seed used to shuffle the data."}) + + + model_parallel: Optional[bool] = field(default=False, metadata={"help": "whether apply model parallelization"}) + + def __post_init__(self): + if self.task_name is None: + raise ValueError("Need either a dataset name or a training/validation file.") + if self.val_max_target_length is None: + self.val_max_target_length = self.max_target_length + if self.test_max_target_length is None: + self.test_max_target_length = self.max_target_length + + + +import dataclasses + +@dataclass +class DeltaArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + delta_type: str= field(default="", metadata={"help": "the type of delta"}) + backbone_model: Optional[str] = field( + default="", metadata={"help": "the backbone model"} + ) + model_path_public: Optional[str] = field( + default="", metadata={"help": "the path (url) of the publicly available backbone model"} + ) + modified_modules: Optional[List[str]] = field( + default_factory=lambda: None, metadata={"help": "the modules inside the backbone to be modified"} + ) + unfrozen_modules: Optional[List[str]] = field( + default_factory=lambda:["deltas"], metadata={"help": "the modules inside the backbone or in the delta modules that need to be unfrozen"} + ) + finetuned_delta_path: Optional[str] = field( + default=None, metadata={"help": "the path of the finetuned delta model"} + ) + force_download: Optional[bool] = field( + default=False, metadata={"help": "whether to download the checkpoint form delta center no matter whether it exists"} + ) + local_files_only: Optional[bool] = field( + default=False, metadata={"help": "whether not to look for file in delta center"} + ) + delta_cache_dir: Optional[str] = field( + default=None, metadata={"help": "The cache path defined by user. If not set, we will firstly look into the"+ + " working directory and then into the default cache path (ususally ~/.cache/delta_center)."} + ) + delay_push: Optional[bool] = field( + default=True, metadata={ + 'help':'whether push the checkpoint to delta center later.' 
+        }
+    )
+
+    def merge_arguments(self, objb):
+        print(objb)
+        self.__class__ = dataclasses.make_dataclass('DeltaArgument', fields=[(s.name, s.type, getattr(objb, s.name)) for s in dataclasses.fields(objb)], bases=(DeltaArguments,))
+
+
+
+
+@dataclass
+class AdapterArguments:
+    bottleneck_dim: Optional[int] = field(
+        default=24, metadata={"help": "the dimension of the bottleneck layer"}
+    )
+@dataclass
+class LoRAArguments:
+    lora_r: Optional[int] = field(
+        default=8, metadata={"help": "the rank of the LoRA matrices."}
+    )
+@dataclass
+class PrefixArguments:
+    pass
+@dataclass
+class BitFitArguments:
+    pass
+@dataclass
+class SoftPromptArguments:
+    soft_token_num: Optional[int] = field(
+        default=100, metadata={"help": "the number of soft tokens."}
+    )
+
+@dataclass
+class CompacterArguments:
+    pass
+@dataclass
+class LowRankAdapterArguments:
+    pass
+
+# from opendelta.delta_models.adapter import AdapterConfig
+# from opendelta.delta_models.bitfit import BitFitConfig
+# from opendelta.delta_models.compacter import CompacterConfig
+# from opendelta.delta_models.lora import LoraArguments
+# from opendelta.delta_models.low_rank_adapter import LowRankAdapterConfig
+# from opendelta.delta_models.prefix import PrefixConfig
+# from opendelta.delta_models.soft_prompt import SoftPromptConfig
+# DELTAARGMAP = {
+#     "adapter": AdapterConfig,
+#     "lora":LoraArguments,
+#     "prefix":PrefixConfig,
+#     "bitfit":BitFitConfig,
+#     "soft_prompt":SoftPromptConfig,
+#     "compacter":CompacterConfig,
+#     "low_rank_adapter":LowRankAdapterConfig
+
+# }
+
+DELTAARGMAP = {
+    "adapter": AdapterArguments,
+    "lora":LoRAArguments,
+    "prefix":PrefixArguments,
+    "bitfit":BitFitArguments,
+    "soft_prompt":SoftPromptArguments,
+    "compacter":CompacterArguments,
+    "low_rank_adapter":LowRankAdapterArguments
+
+}
+
+# TODO: add more specific delta arguments
+
+
+
+class RemainArgHfArgumentParser(HfArgumentParser):
+    '''A more powerful version of the HfArgumentParser.
+    It can receive both command-line arguments and json-file arguments, with the
+    command-line arguments overriding the json-file ones.
+    The parser also loads the delta-type-specific arguments (e.g., the Adapter's)
+    according to the `delta_type` argument and merges them with the common delta arguments.
+    '''
+    def parse_json_file_with_cmd_args(self, json_file: str, command_line_args=None, return_remaining_args=True ):
+        """
+        Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the
+        dataclass types.
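+        For example, if the parser was constructed over (ModelArguments, DataTrainingArguments,
+        TrainingArguments, DeltaArguments), this method typically returns those four populated
+        dataclasses followed by the list of still-unparsed argument strings; the delta-type-specific
+        options are merged into the DeltaArguments instance instead of being returned separately.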
+ """ + + import json + from pathlib import Path + + + + data = json.loads(Path(json_file).read_text()) + + + data_str = "" + if command_line_args is None: + command_line_args = [] + for key in data: + if "--"+key not in command_line_args: + if isinstance(data[key], list): + data_str += "--"+key + for elem in data[key]: + data_str+=" "+ str(elem) + data_str += " " + else: + data_str+= "--" + key + " " + str(data[key]) + " " + + data_list = data_str.split() + data_list += command_line_args + + + if return_remaining_args: + outputs, remain_args = self.parse_args_into_dataclasses(args=data_list, return_remaining_strings=return_remaining_args) + for d in outputs: + if isinstance(d, DeltaArguments): # merge the specific delta arguments + d.merge_arguments(outputs[-1]) + + return [*(outputs[:-1]), remain_args] + else: + outputs = self.parse_args_into_dataclasses(args=data_list, return_remaining_strings=return_remaining_args) + for d in outputs: + if isinstance(d, DeltaArguments): + d.merge_arguments(outputs[-1]) + return [*(outputs[:-1]),] + + def parse_args_into_dataclasses( + self, args=None, return_remaining_strings=False, look_for_args_file=True, args_filename=None + ): + """ + Parse command-line args into instances of the specified dataclass types. + + This relies on argparse's `ArgumentParser.parse_known_args`. See the doc at: + docs.python.org/3.7/library/argparse.html#argparse.ArgumentParser.parse_args + + Args: + args: + List of strings to parse. The default is taken from sys.argv. (same as argparse.ArgumentParser) + return_remaining_strings: + If true, also return a list of remaining argument strings. + look_for_args_file: + If true, will look for a ".args" file with the same base name as the entry point script for this + process, and will append its potential content to the command line args. + args_filename: + If not None, will uses this file instead of the ".args" file specified in the previous argument. + + Returns: + Tuple consisting of: + + - the dataclass instances in the same order as they were passed to the initializer.abspath + - if applicable, an additional namespace for more (non-dataclass backed) arguments added to the parser + after initialization. + - The potential list of remaining argument strings. (same as argparse.ArgumentParser.parse_known_args) + """ + if args_filename or (look_for_args_file and len(sys.argv)): + if args_filename: + args_file = Path(args_filename) + else: + args_file = Path(sys.argv[0]).with_suffix(".args") + + if args_file.exists(): + fargs = args_file.read_text().split() + args = fargs + args if args is not None else fargs + sys.argv[1:] + # in case of duplicate arguments the first one has precedence + # so we append rather than prepend. + namespace, remaining_args = self.parse_known_args(args=args) + + # conditionally add delta arguments + deltatype_args = DELTAARGMAP[namespace.delta_type] + self.dataclass_types.append(deltatype_args) + self._add_dataclass_arguments(deltatype_args) + + # parse the arguments again, this time with the specific delta type's arguments + namespace, remaining_args = self.parse_known_args(args=args) + + + outputs = [] + for dtype in self.dataclass_types: + keys = {f.name for f in dataclasses.fields(dtype) if f.init} + inputs = {k: v for k, v in vars(namespace).items() if k in keys} + for k in keys: + delattr(namespace, k) + obj = dtype(**inputs) + outputs.append(obj) + if len(namespace.__dict__) > 0: + # additional namespace. 
+            outputs.append(namespace)
+        if return_remaining_strings:
+            return (outputs, remaining_args)
+        else:
+            if remaining_args:
+                raise ValueError(f"Some specified arguments are not used by the HfArgumentParser: {remaining_args}")
+
+            return outputs
+
+        # namespace, remaining_args = self.parse_known_args(args=data_list)
+
+        # print("Here", command_line_args, data_list,namespace, remaining_args)
+        # data.update(remain_args)
+
+        # outputs = []
+        # for dtype in self.dataclass_types:
+        #     keys = {f.name for f in dataclasses.fields(dtype) if f.init}
+        #     inputs = {k: namespace.get(k) for k in list(data.keys()) if k in keys}
+        #     obj = dtype(**inputs)
+        #     outputs.append(obj)
+
+        # # remain_args = argparse.ArgumentParser()
+        # remain_args.__dict__.update(remain_args)
+        # if return_remaining_args:
+        #     return (*outputs, remain_args)
+        # else:
+        #     return (*outputs,)
+
diff --git a/OpenDelta-0.3.2/examples/examples_prompt/utils/utils.py b/OpenDelta-0.3.2/examples/examples_prompt/utils/utils.py
new file mode 100644
index 0000000..e995fa3
--- /dev/null
+++ b/OpenDelta-0.3.2/examples/examples_prompt/utils/utils.py
@@ -0,0 +1,48 @@
+
+import json
+import os
+import re
+
+
+
+# class EvalPrediction(NamedTuple):
+#     """
+#     Evaluation output (always contains labels), to be used to compute metrics.
+#     Parameters:
+#       predictions (:obj:`np.ndarray`): Predictions of the model.
+#       label_ids (:obj:`np.ndarray`): Targets to be matched.
+#       data_info: (:obj:`Dict[str, Any]`): Extra dataset information, one requires
+#         to perform the evaluation. The data_info is a dictionary with keys from
+#         train, eval, test to specify the data_info for each split of the dataset.
+#     """
+#     predictions: Union[np.ndarray, Tuple[np.ndarray]]
+#     label_ids: np.ndarray
+#     data_info: Dict[str, Any]
+
+def create_dir(output_dir):
+    """
+    Checks whether the output_dir already exists and creates it if not.
+    Args:
+      output_dir: path to the output_dir
+    """
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+
+def get_last_checkpoint(output_dir):
+    if os.path.exists(os.path.join(output_dir, 'pytorch_model.bin')):
+        return output_dir
+    return None
+
+
+
+
+
+def save_json(filepath, dictionary):
+    with open(filepath, "w") as outfile:
+        json.dump(dictionary, outfile)
+
+
+def read_json(filepath):
+    # use a context manager so the file handle is closed after reading
+    with open(filepath) as f:
+        return json.load(f)
\ No newline at end of file
diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/README.md b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/README.md
new file mode 100644
index 0000000..38c5b22
--- /dev/null
+++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/README.md
@@ -0,0 +1,64 @@
+# Applying OpenDelta to GLUE/SuperGLUE tasks using the Seq2Seq paradigm
+
+
+## Install the repo
+```bash
+cd ../
+python setup_seq2seq.py develop
+```
+This will add `examples_seq2seq` to the Python path of your environment.
+
+## Generating the json configuration file
+
+```
+python config_gen.py --job $job_name
+
+```
+The available job configurations (e.g., `--job lora_t5-base`) can be seen in `config_gen.py`. You can also
+create your own configuration.
+
+
+## Run the code
+
+```
+python run_seq2seq.py configs/$job_name/$dataset.json
+```
+
+## Possible Errors
+
+1.
+```
+ValueError: You must login to the Hugging Face hub on this computer by typing `transformers-cli login` and entering your credentials to use `use_auth_token=Tr
+ue`. Alternatively, you can pass your own token as the `use_auth_token` argument.
+``` +- Solution 1: Please register an account on [HuggingFace](https://huggingface.co/) +Then run transformers-cli login on your command line to enter the username and password. + +- Solution 2: Disable push_to_hub by modifying in the config.json : "push_to_hub": False + +2. +``` +OSError: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once). +``` + +- Solution 1: +``` +wget -P ~ https://github.com/git-lfs/git-lfs/releases/download/v3.0.2/git-lfs-linux-amd64-v3.0.2.tar.gz +cd ~ +tar -xvzf git-lfs-linux-amd64-v3.0.2.tar.gz +export PATH=~:$PATH +git-lfs install +``` + +- Solution 2: Disable push_to_hub by modifying in the config.json : "push_to_hub": False + + +3. dataset connection error + +Solution 1: open a python console, running the error command again, may not be useful + +Solution 2: download the dataset by yourself on a internect connected machine, saved to disk and transfer to your server, at last load_from_disk. + + +## Link to the original training scripts +This example repo is based on the [compacter training scripts](https://github.com/rabeehk/compacter), with compacter-related lines removed. Thanks to the authors of the original repo. In addition, in private correspondence with the authors, they shared the codes to create the json configs. Thanks again for their efforts. diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/__init__.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/configs/config_gen_bs.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/configs/config_gen_bs.py new file mode 100644 index 0000000..1d9f238 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/configs/config_gen_bs.py @@ -0,0 +1,410 @@ +import collections +import copy + +BS = 1 +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['t5-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 
100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "t5-base", + "tokenizer_name": "t5-base", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +BaseConfigs['t5-large'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "/home/hushengding/plm_cache/t5-large", + "tokenizer_name": "/home/hushengding/plm_cache/t5-large", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
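+    # NOTE: combined with `evaluation_strategy: steps` and `metric_for_best_model: average_metrics`,
+    # the Trainer evaluates every `eval_steps` and reloads the best checkpoint when training finishes.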
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +BaseConfigs['t5-3b'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "/home/hushengding/plm_cache/t5-3b", + "tokenizer_name": "/home/hushengding/plm_cache/t5-3b", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['bitfit_t5-base'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/t5-base/", + }) + + + +AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['adapter_t5-base'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/t5-base/", + }) + +AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['lora_t5-base'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/t5-base/", + }) + +AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter++_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter++/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. 
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['low_rank_adapter_t5-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/low_rank_adapter/t5-base/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + + +AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['soft_prompt_t5-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "token_init": False, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/t5-base/", + }) + +AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['prefix_t5-base'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/t5-base/", + }) + +AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['none_t5-base'].update({ + "delta_type": "none", + "learning_rate": 3e-5, + "output_dir": "outputs/none/t5-base/", + }) + +AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) +AllConfigs['bitfit_t5-large'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/t5-large/", + }) + +AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) +AllConfigs['none_t5-large'].update({ + "delta_type": "none", + "learning_rate": 3e-5, + "output_dir": "outputs/none/t5-large/", + }) + + +AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) +AllConfigs['bitfit_t5-3b'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/t5-3b/", + }) + +AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) +AllConfigs['none_t5-3b'].update({ + "delta_type": "none", + "learning_rate": 3e-5, + "output_dir": "outputs/none/t5-3b/", + }) + +AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) +AllConfigs['adapter_t5-3b'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/t5-3b/", + }) + +AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) +AllConfigs['adapter_t5-large'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/t5-large/", + }) + +AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) +AllConfigs['lora_t5-large'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/t5-large/", + }) + +AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) +AllConfigs['lora_t5-3b'].update({ + "delta_type": "lora", + "learning_rate": 
3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/t5-3b/", + }) + + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"./{args.job}_{BS}/"): + os.mkdir(f"./{args.job}_{BS}/") + + for job_name in all_config_jsons: + with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/data_processors/__init__.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/data_processors/__init__.py new file mode 100644 index 0000000..8b9d6bc --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/data_processors/__init__.py @@ -0,0 +1,3 @@ +from .tasks import TASK_MAPPING, AutoTask +from .data_collator import TaskDataCollatorForSeq2Seq +from .postprocessors import AutoPostProcessor diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/data_processors/data_collator.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/data_processors/data_collator.py new file mode 100644 index 0000000..744a929 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/data_processors/data_collator.py @@ -0,0 +1,16 @@ +import numpy as np +from dataclasses import dataclass +from transformers import DataCollatorForSeq2Seq + + +@dataclass +class TaskDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): + def check_uniqueness(self, samples): + assert len(np.unique(samples)) == 1 + + def __call__(self, features): + # tasks = [d.pop('task') for d in features] + # self.check_uniqueness(tasks) + output = super().__call__(features) + # output["task"] = tasks[0] + return output \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/data_processors/postprocessors.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/data_processors/postprocessors.py new file mode 100644 index 0000000..a4155b5 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/data_processors/postprocessors.py @@ -0,0 +1,64 @@ +import abc +from collections import OrderedDict +import numpy as np + +"""Defines functions to process the outputs to make them ready for the evaluation.""" + +def string_to_float(string, default=-1., **unused_kwargs): + """Converts string to float, using default when conversion not possible.""" + try: + return float(string) + except ValueError: + return default + + +class PostProcessor(abc.ABC): + """Postprocess the predictions and labels to make them suitable for + evaluation.""" + def __init__(self, tokenizer, ignore_pad_token_for_loss): + self.tokenizer = tokenizer + 
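+        # NOTE: when ignore_pad_token_for_loss is True, label ids of -100 are mapped back to the
+        # pad token id before decoding (see `process` below).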
self.ignore_pad_token_for_loss = ignore_pad_token_for_loss + + def process(self, preds, labels, data_info=None): + if isinstance(preds, tuple): + preds = preds[0] + decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) + if self.ignore_pad_token_for_loss: + # Replace -100 in the labels as we can't decode them. + labels = np.where(labels != -100, labels, self.tokenizer.pad_token_id) + decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) + # Some simple post-processing + decoded_preds = [pred.strip() for pred in decoded_preds] + decoded_labels = [label.strip() for label in decoded_labels] + return decoded_preds, decoded_labels + + +class MultiRC(PostProcessor): + def process(self, preds, labels, data_info): + preds, labels = super().process(preds, labels, data_info) + preds = [{"group": info["group"], "value":pred} \ + for info, pred in zip(data_info, preds)] + labels = [{"group": info["group"], "value": label}\ + for info, label in zip(data_info, labels)] + return preds, labels + +class Record(PostProcessor): + def process(self, preds, labels, data_info): + preds, labels = super().process(preds, labels, data_info) + labels = [info["answers"] for info in data_info] + return preds, labels + + +POSTPROCESSOR_MAPPING = OrderedDict( + [ + ('superglue-record', Record), + ('superglue-multirc', MultiRC) + ] +) + +class AutoPostProcessor: + @classmethod + def get(self, task, tokenizer, ignore_pad_token_for_loss): + if task in POSTPROCESSOR_MAPPING: + return POSTPROCESSOR_MAPPING[task](tokenizer, ignore_pad_token_for_loss) + return PostProcessor(tokenizer, ignore_pad_token_for_loss) diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/data_processors/tasks.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/data_processors/tasks.py new file mode 100644 index 0000000..a4f8f44 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/data_processors/tasks.py @@ -0,0 +1,584 @@ +from collections import OrderedDict +import collections +import abc +import functools +from typing import Callable, List, Mapping +from examples_seq2seq.trainers.trainer_utils import pad_punctuation +from examples_seq2seq.metrics import metrics +from .utils import round_stsb_target +import datasets +import logging +import numpy as np +import torch +import re + +logger = logging.getLogger(__name__) + +class AbstractTask(abc.ABC): + name = NotImplemented + config = NotImplemented + prefix = NotImplemented + preprocessor: Callable = NotImplemented + metric = NotImplemented + metric_names = NotImplemented + split_map = None + labels_list = None + split_to_data_split: Mapping[str, str] = \ + {"train": "train", "validation": "validation", "test": "test"} + small_datasets_without_all_splits = ["cola", "wnli", "rte", "superglue-cb", "superglue-copa", "superglue-multirc", + "superglue-wic", "superglue-wsc.fixed", "superglue-rte", "mrpc", "stsb", + "superglue-boolq"] + large_data_without_all_splits = ["qqp", "qnli", "superglue-record", "sst2"] + + def __init__(self, config, seed=42): + self.config = config + self.seed = seed + + def get_max_target_length(self, tokenizer, default_max_length): + if self.labels_list is not None: + return max([len(tokenizer.encode(label)) for label in self.labels_list]) + return default_max_length + + def seq2seq_format(self, sources: List[str], + targets: List[str], + add_prefix: bool=False, + prefix: str=None, + extra_fields={}): + src_prefix = self.name if prefix is None else prefix + sources = [src_prefix]+sources if add_prefix 
else sources + return {'source': ' '.join(sources), + 'target': ' '.join(targets), + 'task': self.name, + 'extra_fields': extra_fields} + + def check_n_obs(self, n_obs, total_size): + if n_obs is not None and n_obs > total_size: + n_obs = total_size + logger.warning("n_obs is set to %s", n_obs) + return n_obs + + def shuffled_indices(self, dataset): + num_samples = len(dataset) + generator = torch.Generator() + generator.manual_seed(self.seed) + return torch.randperm(num_samples, generator=generator).tolist() + + def subsample(self, dataset, n_obs=None, indices=None): + """ + Given a dataset returns the subsampled dataset. + :param n_obs: the number of samples of the subsampled dataset. + :param indices: indices to select the samples from, if not given, indices are computed + from by shuffling the given dataset. + :return: subsampled dataset. + """ + num_samples = len(dataset) + n_obs = self.check_n_obs(n_obs, num_samples) + if indices is None: + indices = self.shuffled_indices(dataset) + indices = indices[:n_obs] + return dataset.select(indices) + + def load_dataset(self, split: int): + return datasets.load_dataset(self.name, self.config, split=split, script_version="master") + + def get_split_indices(self, split, dataset, validation_size): + indices = self.shuffled_indices(dataset) + if split == "validation": + return indices[:validation_size] + else: + return indices[validation_size:] + + def map_dataset(self, dataset, add_prefix): + return dataset.map(functools.partial(self.preprocessor, add_prefix=add_prefix), + remove_columns=dataset.column_names) + + def get(self, split, add_prefix=True, n_obs=None, split_validation_test=False): + # For small datasets (n_samples < 10K) without test set, we divide validation set to + # half, use one half as test set and one half as validation set. + if split_validation_test and self.name in self.small_datasets_without_all_splits \ + and split != "train": + mapped_split = self.split_to_data_split["validation"] + dataset = self.load_dataset(split=mapped_split) + indices = self.get_split_indices(split, dataset, validation_size=len(dataset)//2) + dataset = self.subsample(dataset, n_obs, indices) + # For larger datasets (n_samples > 10K), we divide training set into 1K as + # validation and the rest as training set, keeping the original validation + # set as the test set. + elif split_validation_test and self.name in self.large_data_without_all_splits \ + and split != "test": + dataset = self.load_dataset(split="train") + indices = self.get_split_indices(split, dataset, validation_size=1000) + dataset = self.subsample(dataset, n_obs, indices) + else: + mapped_split = self.split_to_data_split[split] + dataset = self.load_dataset(split=mapped_split) + # shuffles the data and samples it. 
+ if n_obs is not None: + dataset = self.subsample(dataset, n_obs) + return self.map_dataset(dataset, add_prefix) + +class Squad(AbstractTask): + name = "squad" + metric = [metrics.squad] + + def load_dataset(self, split): + return datasets.load_dataset(self.name, split=split, script_version="master") + + def preprocessor(self, example, add_prefix): + answer = pad_punctuation(example['answers']['text'][0]) + question = pad_punctuation(example['question']) + context = pad_punctuation(example['context']) + source = ["question:", question, + "context:", context] + target = [answer] + return self.seq2seq_format(source, target, add_prefix) + + +class MRPC(AbstractTask): + name = "mrpc" + labels_list = ["0", "1"] + metric = [metrics.f1_score_with_invalid, metrics.accuracy] + metric_names = ["f1", "accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'mrpc', split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["sentence1:", example['sentence1'], + "sentence2:", example["sentence2"]] + tgt_texts = [str(example['label'])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class COLA(AbstractTask): + name = "cola" + labels_list = ["0", "1"] + metric = [metrics.matthews_corrcoef] + metric_names = ["matthews_correlation"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'cola', + split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["sentence:", example['sentence']] + tgt_texts = [str(example['label'])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class SST2(AbstractTask): + name = "sst2" + labels_list = ["0", "1"] + metric = [metrics.accuracy] + metric_names = ["accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'sst2', + split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["sentence:", example['sentence']] + tgt_texts = [str(example['label'])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class STSB(AbstractTask): + name = "stsb" + labels_list = [str(np.round(label, decimals=1)) for label in np.arange(0, 5.2, 0.2)] + metric = [metrics.pearson_corrcoef, metrics.spearman_corrcoef] + metric_names = ["pearson", "spearmanr"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'stsb', + split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["sentence1:", example['sentence1'], + "sentence2:", example["sentence2"]] + tgt_texts = [str(round_stsb_target(example['label']))] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class QQP(AbstractTask): + name = "qqp" + labels_list = ["0", "1"] + metric = [metrics.f1_score_with_invalid, metrics.accuracy] + metric_names = ["f1", "accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'qqp', + split=split, script_version="master") + + def preprocessor(self, 
example, add_prefix=True): + src_texts = ["question1:", example['question1'], + "question2:", example["question2"]] + tgt_texts = [str(example['label'])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class MNLI(AbstractTask): + name = "mnli" + labels_list = ["0", "1", "2"] + split_to_data_split = {"train": "train", + "validation": "validation_mismatched", + "test": "validation_matched"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'mnli', split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["premise:", example['premise'], + "hypothesis", example["hypothesis"]] + tgt_texts = [str(example['label'])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class QNLI(AbstractTask): + name = "qnli" + labels_list = ["0", "1"] + metric = [metrics.accuracy] + metric_names = ["accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'qnli', split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["question:", example['question'], + "sentence:", example["sentence"]] + tgt_texts = [str(example['label'])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + +class RTE(AbstractTask): + name = "rte" + labels_list = ["0", "1"] + metric = [metrics.accuracy] + metric_names = ["accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'rte', + split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["sentence1:", example['sentence1'], + "sentence2:", example["sentence2"]] + tgt_texts = [str(example['label'])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class WNLI(AbstractTask): + name = "wnli" + labels_list = ["0", "1"] + metric = [metrics.accuracy] + metric_names = ["accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'wnli', split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["sentence1:", example['sentence1'], + "sentence2:", example["sentence2"]] + tgt_texts = [str(example['label'])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class SuperGLUEBoolQ(AbstractTask): + name="superglue-boolq" + labels_list = ['0', '1'] + metric = [metrics.accuracy] + metric_names = ["accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('super_glue', 'boolq', split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["question:", example["question"], "passage:", example["passage"]] + tgt_texts = [str(example["label"])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class SuperGLUERTE(AbstractTask): + name="superglue-rte" + labels_list = ['0', '1'] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + def load_dataset(self, split): + return datasets.load_dataset('super_glue', 
'rte', split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["premise:", example["premise"], + "hypothesis:", example["hypothesis"]] + tgt_texts = [str(example["label"])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class SuperGLUECB(AbstractTask): + name = "superglue-cb" + labels_list = ['0', '1', '2'] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.mean_multiclass_f1(num_classes=3), metrics.accuracy] + metric_names = ["f1_multiclass", "accuracy"] + + def load_dataset(self, split): + return datasets.load_dataset('super_glue', 'cb', split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["premise:", example["premise"], "hypothesis:", example["hypothesis"]] + tgt_texts = [str(example["label"])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class SuperGLUECOPA(AbstractTask): + name = "superglue-copa" + labels_list = ['0', '1'] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + def load_dataset(self, split): + return datasets.load_dataset('super_glue', 'copa', split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["premise:", example["premise"], + "choice1:", example["choice1"], + "choice2:", example["choice2"]] + tgt_texts = [str(example["label"])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class SuperGLUEMultiRC(AbstractTask): + name = "superglue-multirc" + labels_list = ['0', '1'] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.multirc_f1_over_all_answers, + metrics.mean_group_metric(metrics.exact_match)] + metric_names = ["f1", "em"] + + def load_dataset(self, split): + return datasets.load_dataset('super_glue', 'multirc', split=split, script_version="master") + + def remove_markup(self, text): + """Removes the HTML markup.""" + text = re.sub('
', ' ', text) + text = re.sub('<(/)?b>', '', text) + return text + + def preprocessor(self, example, add_prefix=True): + group = example['idx']['question'] + # T5 applies remove_markup to the joined string, but this should not make + # any difference as well. + # https://github.com/google-research/text-to-text-transfer-transformer/blob/a1352e625db7ec114062f99d99b0565b9e45c155/t5/data/preprocessors.py#L797 + src_texts = ["question:", self.remove_markup(example["question"]), + "answer:", self.remove_markup(example["answer"]), + "paragraph:", self.remove_markup(example["paragraph"])] + tgt_texts = [str(example["label"])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix, extra_fields={"group": group}) + + + +class SuperGLUEWIC(AbstractTask): + name = "superglue-wic" + labels_list = ['0', '1'] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + def load_dataset(self, split): + return datasets.load_dataset('super_glue', 'wic', split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["sentence1:", example["sentence1"], + "sentence2:", example["sentence2"], + "word:", example["word"]] + tgt_texts = [str(example["label"])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class SuperGLUEWSCFixed(AbstractTask): + # source: https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/data/preprocessors.py + """Convert WSC examples to text2text format. + WSC includes a sentence along with 2 'spans': the first denoting a noun and + the other a pronoun. The 'label' specifies whether or not the pronoun is + referencing the noun. This preprocessor puts ' * ' around the noun and ' # ' + around the pronoun. + For example, a typical example from WSC might look like + { + 'text': 'This is a test sentence .', + 'span1_text': 'test', + 'span1_index': 3, + 'span2_text': 'This', + 'span2_index': 0, + 'label': 0 + } + This example would be transformed to + { + 'inputs': 'wsc text: # This # is a * test * sentence .', + 'targets': 'False' + } + """ + name = "superglue-wsc.fixed" + labels_list = ['0', '1'] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + def load_dataset(self, split): + return datasets.load_dataset('super_glue', 'wsc.fixed', split=split, script_version="master") + + def _mark_span(self, text, span_str, span_idx, mark): + pattern_tmpl = r'^((?:\S+\s){N})(W)' + pattern = re.sub('N', str(span_idx), pattern_tmpl) + pattern = re.sub('W', span_str, pattern) + return re.sub(pattern, r'\1{0} \2 {0}'.format(mark), text) + + def preprocessor(self, example, add_prefix=True): + # converts text as done in T5. + text = example['text'] + text = self._mark_span(text, example['span1_text'], example['span1_index'], '*') + # Compensate for 2 added "words" added in previous step. + span2_index = example['span2_index'] + 2 * int(example['span1_index'] < example['span2_index']) + text = self._mark_span(text, example['span2_text'], span2_index, '#') + src_texts = ["text:", text] + tgt_texts = [str(example["label"])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class SuperGLUERecord(AbstractTask): + """Convert ReCoRD examples to text2text examples. 
+ ReCoRD contains a passage, query containing a '@placeholder' string, and a set + of entities that are the possible values of the placeholder. Each train and + validation example will have a list of answers, any of which would be + considered correct. + For example, a typical example from ReCoRD might look like + { + 'passsage': 'This is the passage.', + 'query': 'A @placeholder is a bird.', + 'entities': ['penguin', 'potato', 'pigeon'], + 'answers': ['penguin', 'pigeon'], + } + which this preprocessor would turn into the following two examples: + { + 'inputs': 'record query: A @placeholder is a bird. entities: penguin, ' + 'potato, pigeon passage: This is the passage.', + 'targets': 'penguin', + } + and + { + 'inputs': 'record query: A @placeholder is a bird. entities: penguin, ' + 'potato, pigeon passage: This is the passage.', + 'targets': 'pigeon', + } + """ + name = "superglue-record" + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.squad] + metric_names = ["squad"] + + def load_dataset(self, split): + return datasets.load_dataset('super_glue', 'record', split=split, script_version="master") + + def preprocessor(self, batch, add_prefix=True): + new_batch = collections.defaultdict(list) + keys = batch.keys() + for values in zip(*batch.values()): + ex = {k: v for k, v in zip(keys, values)} + # updates the passage. + passage = ex['passage'] + passage = re.sub(r'(\.|\?|\!|\"|\')\n@highlight\n', r'\1 ', passage) + passage = re.sub(r'\n@highlight\n', '. ', passage) + inputs = f"record query: {ex['query']} entities: {', '.join(ex['entities'])} passage: {passage}" + if add_prefix: + inputs = self.name + " " + inputs + # duplicates the samples based on number of answers. + num_answers = len(ex["answers"]) + num_duplicates = np.maximum(1, num_answers) + new_batch["source"].extend([inputs] * num_duplicates) + new_batch["target"].extend(ex["answers"] if num_answers > 0 else [""]) + new_batch["task"].extend([self.name] * num_duplicates) + new_batch["extra_fields"].extend([{"answers": ex["answers"]}]*num_duplicates) + return new_batch + + def map_dataset(self, dataset, add_prefix=True): + return dataset.map(functools.partial(self.preprocessor, add_prefix=add_prefix), + batched=True, remove_columns=dataset.column_names) + + +TASK_MAPPING = OrderedDict( + [ + ('squad', Squad), + ('mrpc', MRPC), + ('cola', COLA), + ('sst2', SST2), + ('qnli', QNLI), + ('rte', RTE), + ('wnli', WNLI), + ('mnli', MNLI), + ('qqp', QQP), + ('stsb', STSB), + ('superglue-boolq', SuperGLUEBoolQ), + ('superglue-rte', SuperGLUERTE), + ('superglue-cb', SuperGLUECB), + ('superglue-copa', SuperGLUECOPA), + ('superglue-multirc', SuperGLUEMultiRC), + ('superglue-wic', SuperGLUEWIC), + ('superglue-wsc.fixed', SuperGLUEWSCFixed), + ('superglue-record', SuperGLUERecord) + ] +) + +class AutoTask: + @classmethod + def get(self, task, config, seed=42): + if task in TASK_MAPPING: + return TASK_MAPPING[task](config, seed) + raise ValueError( + "Unrecognized task {} for AutoTask Model: {}.\n" + "Task name should be one of {}.".format( + ", ".join(c for c in TASK_MAPPING.keys()) + ) + ) diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/data_processors/utils.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/data_processors/utils.py new file mode 100644 index 0000000..1445974 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/data_processors/utils.py @@ -0,0 +1,17 @@ +import numpy as np + +def round_stsb_target(label): + """STSB maps two 
sentences to a floating point number between 1 and 5 + representing their semantic similarity. Since we are treating all tasks as + text-to-text tasks we need to convert this floating point number to a string. + The vast majority of the similarity score labels in STSB are in the set + [0, 0.2, 0.4, ..., 4.8, 5.0]. So, we first round the number to the closest + entry in this set, and then we convert the result to a string (literally e.g. + "3.4"). This converts STSB roughly into a 26-class classification dataset. + Args: + label: original label. + Returns: + A preprocessed label. + """ + return np.round((label * 5) / 5, decimals=1) + diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/metrics/__init__.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/metrics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/metrics/metrics.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/metrics/metrics.py new file mode 100644 index 0000000..1dfa865 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/metrics/metrics.py @@ -0,0 +1,173 @@ +# several of the evaluation metrics are from https://github.com/google-research/text-to-text-transfer-transformer/blob/a1352e625db7ec114062f99d99b0565b9e45c155/t5/evaluation/metrics.py +"""Defines different metrics used for evaluation of tasks.""" +import numpy as np +import scipy +import math +import sklearn +import collections +from logging import getLogger +from .qa_utils import normalize_squad, qa_metrics +import sklearn.metrics + +logger = getLogger(__name__) + +def accuracy(predictions, targets) -> dict: + """Computes the average accuracy.""" + return {"accuracy": 100 * ((np.array(predictions) == np.array(targets)).mean())} + +def pearson_corrcoef(predictions, targets) -> dict: + """Computes Pearson correlation coefficient.""" + from examples_seq2seq.data_processors.postprocessors import string_to_float + targets = [string_to_float(target) for target in targets] + predictions= [string_to_float(prediction) for prediction in predictions] + pearson_corrcoef = 100 * scipy.stats.pearsonr(targets, predictions)[0] + + # Note that if all the predictions will be the same, spearman + # correlation is nan, to gaurad against this, we check the output + # and return 0 in this case. + if math.isnan(pearson_corrcoef): + pearson_corrcoef = 0 + return {"pearson": pearson_corrcoef} + + +def spearman_corrcoef(predictions, targets) -> dict: + """Computes Spearman correlation coefficient.""" + # TODO: we need to do postprocessors in a clean way for each dataset. + from examples_seq2seq.data_processors.postprocessors import string_to_float + targets = [string_to_float(target) for target in targets] + predictions= [string_to_float(prediction) for prediction in predictions] + spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0] + + # Note that if all the predictions will be the same, spearman + # correlation is nan, to gaurad against this, we check the output + # and return 0 in this case. + if math.isnan(spearman_corrcoef): + spearman_corrcoef = 0 + return {"spearmanr": spearman_corrcoef} + + +def f1_score_with_invalid(predictions, targets) -> dict: + """Computes F1 score, with any prediction != 0 or 1 is counted as incorrect. + Args: + targets: list of targets, either 0 or 1 + predictions: list of predictions, any integer value + Returns: + F1 score, where any prediction != 0 or 1 is counted as wrong. 
+ """ + def binary_reverse(labels): + return ['0' if label == '1' else '1' for label in labels] + targets, predictions = np.asarray(targets), np.asarray(predictions) + # Get indices of invalid predictions. + invalid_idx_mask = np.logical_and(predictions != '0', predictions != '1') + # For any prediction != 0 or 1, we set the prediction to the opposite of its corresponding target. + predictions[invalid_idx_mask] = binary_reverse(targets[invalid_idx_mask]) + targets = targets.astype(np.int32) + predictions = predictions.astype(np.int32) + return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)} + +# TODO: maybe gaurd against invalid values https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow +def matthews_corrcoef(predictions, targets) -> dict: + """Computes the Matthews correlation coefficient.""" + return {"matthews_correlation": 100 * sklearn.metrics.matthews_corrcoef(targets, predictions)} + +def squad(predictions, targets): + """Computes SQuAD metrics, maximizing over answers per question. + Args: + targets: list of lists of strings + predictions: list of strings + Returns: + dict with score_key: squad score across all targets and predictions + """ + + targets = [[normalize_squad(t) for t in u] for u in targets] + predictions = [normalize_squad(p) for p in predictions] + return qa_metrics(targets, predictions) + + +def exact_match(predictions, targets): + """Computes whether the targets match predictions exactly.""" + return {"em": 100 * float(np.array_equal(targets, predictions))} + + +def sklearn_metrics_wrapper(metric_str, + metric_dict_str=None, + metric_post_process_fn=None, + **metric_fn_kwargs): + """Wraps any sklearn.metric function and returns a t5 metric function. + Args: + metric_str: string, the function from `sklearn.metrics` to use. + metric_dict_str: optional string, if not specified `metric_str` is used as + the key in the returned dictionary. + metric_post_process_fn: callable, if specified the final computed metric + will be passed through this. + **metric_fn_kwargs: kwargs, passed to the metric function we are calling. + Returns: + the function that calculates the metric in a dict. + """ + if not hasattr(sklearn.metrics, metric_str): + raise ValueError("sklearn.metrics does not have: %s" % metric_str) + + def fn(predictions, targets): + metric_fn = getattr(sklearn.metrics, metric_str) + metric_val = metric_fn(targets, predictions, **metric_fn_kwargs) + if metric_post_process_fn is not None: + metric_val = metric_post_process_fn(metric_val) + return {metric_dict_str or metric_str: metric_val} + return fn + + +def mean_multiclass_f1(num_classes, **metric_fn_kwargs): + """Computes the unweighted average of the F1 per class.""" + return sklearn_metrics_wrapper( + "fbeta_score", + metric_dict_str="f1_multiclass", + metric_post_process_fn=lambda x: 100 * x, + beta=1, + labels=range(num_classes), + average="macro", + **metric_fn_kwargs) + + +def multirc_f1_over_all_answers(targets, predictions): + """Special metric for MultiRC which computes F1 score over all examples. + This is necessary because the targets/predictions for MultiRC are dicts and + the f1_score_with_invalid expects a list of True/False labels, not dicts. As + a result we just need to key in the "value" for each of the example dicts + before feeding into f1_score_with_invalid. + Args: + targets: list of dicts, where each dict has a "value" key. + predictions: list of dicts, where each dict has a "value" key. 
+ Returns: + F1 score over values, where any prediction != 0 or 1 is counted as wrong. + """ + return f1_score_with_invalid( + [t["value"] for t in targets], [p["value"] for p in predictions] + ) + + +def mean_group_metric(metric_fn, group_key="group", value_key="value"): + """Returns a metric that averages `metric_fn` on sub-groups of results. + The sub-groups are defined by aggregating results (targets and predictions) + by accessing the feature specified by `group_key` in the target dicts. + **WARNING**: Using this function can produce unreliable results if you do not + pass in full groups. For example, if you evaluate over a random subsample of a + validation set and do not retain all of the examples in each group, you may + get results which aren't directly comparable to using the full validation set. + Args: + metric_fn: function, the metric to compute on the subgroups. + group_key: string, the key for the grouping value in the target dictionary. + value_key: string, the key for the value in the dictionaries. + """ + def my_metric(targets, predictions): + """Computes mean of `metric_fn` over subgroups of results.""" + grouped_values = collections.defaultdict(lambda: ([], [])) + for targ, pred in zip(targets, predictions): + g = targ[group_key] + grouped_values[g][0].append(targ[value_key]) + grouped_values[g][1].append(pred[value_key]) + group_scores = collections.defaultdict(list) + for (targets, predictions) in grouped_values.values(): + for metric, score in metric_fn(targets, predictions).items(): + group_scores[metric].append(score) + return {metric: np.mean(scores) for metric, scores in group_scores.items()} + return my_metric \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/metrics/qa_utils.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/metrics/qa_utils.py new file mode 100644 index 0000000..fe3fb0c --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/metrics/qa_utils.py @@ -0,0 +1,96 @@ +# Copyright 2021 The T5 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# source: the codes are from https://github.com/google-research/text-to-text-transfer-transformer +"""Utilities for Question Answering (QA) evaluation. +Matches results on the SQuAD (v1.1) and TriviaQA (v1.0) evaluation scripts. 
+""" + +import collections +import string +import regex as re +import numpy as np + + +def _normalize_answer(text, punc_chars, punc_repl): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(s): + return re.sub(r"\b(a|an|the)\b", " ", s) + + def replace_punctuation(s): + to_replace = set(punc_chars) + return "".join(punc_repl if ch in to_replace else ch for ch in s) + + def white_space_fix(s): + return " ".join(s.split()) + + text = text.lower() + text = replace_punctuation(text) + text = remove_articles(text) + text = white_space_fix(text) + return text + + +def normalize_trivia_qa(answer): + """Normalization used in official TriviaQA evaluation script.""" + return _normalize_answer( + answer, punc_chars=string.punctuation + "‘’´`_", punc_repl=" ").strip() + + +def normalize_squad(answer): + """Normalization used in official SQuAD evaluation script.""" + return _normalize_answer(answer, punc_chars=string.punctuation, punc_repl="") + + +def _metric_max_over_ground_truths(metric_fn, ground_truths, prediction): + """Computes the maximum of the metric over all ground truths.""" + return max( + metric_fn(ground_truth, prediction) for ground_truth in ground_truths + ) + + +def _exact_match_score(target, prediction): + return target == prediction + + +def _f1_score(target, prediction): + """Computes token f1 score for a single target and prediction.""" + prediction_tokens = prediction.split() + target_tokens = target.split() + common = (collections.Counter(prediction_tokens) & + collections.Counter(target_tokens)) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(target_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def qa_metrics(targets, predictions): + """Computes exact match and f1 QA scores, expecting pre-normalized text.""" + if len(targets) != len(predictions): + raise ValueError("Number of targets and predictions must match.") + em = np.mean([ + _metric_max_over_ground_truths(_exact_match_score, t, p) + for p, t in zip(predictions, targets) + ]) + f1 = np.mean([ + _metric_max_over_ground_truths(_f1_score, t, p) + for p, t in zip(predictions, targets) + ]) + em *= 100 + f1 *= 100 + return {"em": em, "f1": f1} diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/run_seq2seq.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/run_seq2seq.py new file mode 100644 index 0000000..defa655 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/run_seq2seq.py @@ -0,0 +1,510 @@ +# coding=utf-8 +# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for sequence to sequence. +""" +# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments. 
+import functools +import logging +# from opendelta.utils.delta_center import create_hub_repo_name +import torch +import os +os.environ['MKL_THREADING_LAYER'] = 'GNU' +os.environ['MKL_SERVICE_FORCE_INTEL'] = '1' +import sys +import subprocess +from typing import Optional, List + +from datasets import load_dataset, load_metric, concatenate_datasets +import transformers +from transformers import ( + AutoConfig, + AutoModelForSeq2SeqLM, + AutoTokenizer, + HfArgumentParser, + MBartTokenizer, + default_data_collator, + set_seed, +) +from transformers.trainer_utils import is_main_process, get_last_checkpoint +# from ..seq2seq.utils import get_adapter_config +from examples_seq2seq.data_processors import AutoTask, TaskDataCollatorForSeq2Seq, AutoPostProcessor +from examples_seq2seq.seq2seq_trainer import Seq2SeqTrainer +# from training_args import AdapterTrainingArguments +from examples_seq2seq.trainers.trainer_utils import save_training_config +from dataclasses import dataclass, field + +from transformers.models.t5.modeling_t5 import T5Config, T5ForConditionalGeneration +from examples_seq2seq.trainers.model_args import ModelArguments +from examples_seq2seq.trainers.trainer_args import TrainingArguments, DataTrainingArguments + +import tensorboardX +tb_writer = tensorboardX.SummaryWriter("Delta_Memory") + +logger = logging.getLogger(__name__) + +def run_command(command): + output = subprocess.getoutput(command) + return output + + +TASK_TO_METRICS = {"mrpc": ["accuracy", "f1"], + "cola": ['matthews_correlation'], + "stsb": ['pearson', 'spearmanr'], + 'sst2': ['accuracy'], + "mnli": ["accuracy"], + "mnli_mismatched": ["accuracy"], + "mnli_matched": ["accuracy"], + "qnli": ["accuracy"], + "rte": ["accuracy"], + "wnli": ["accuracy"], + "qqp": ["accuracy", "f1"], + "superglue-boolq": ["accuracy"], + "superglue-rte": ["accuracy"], + "superglue-cb": ["f1_multiclass", "accuracy"], + "superglue-copa": ["accuracy"], + "superglue-multirc": ["f1", "em"], + "superglue-wic": ["accuracy"], + "superglue-wsc.fixed": ["accuracy"], + "superglue-record": ["f1", "em"] + } + + +class RemainArgHfArgumentParser(HfArgumentParser): + def parse_json_file(self, json_file: str, return_remaining_args=True ): + """ + Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the + dataclass types. 
+ """ + import argparse + import json + from pathlib import Path + import dataclasses + + data = json.loads(Path(json_file).read_text()) + outputs = [] + for dtype in self.dataclass_types: + keys = {f.name for f in dataclasses.fields(dtype) if f.init} + inputs = {k: data.pop(k) for k in list(data.keys()) if k in keys} + obj = dtype(**inputs) + outputs.append(obj) + + remain_args = argparse.ArgumentParser() + remain_args.__dict__.update(data) + if return_remaining_args: + return (*outputs, remain_args) + else: + return (*outputs,) + +# from transformers.trainer_callback import TrainerCallback + +# class MyCallback(TrainerCallback): +# def __init__(self, *args, **kwargs): +# self.delta_args = kwargs.pop("delta_args") +# self.trainer_args = kwargs.pop("trainer_args") +# self.model_args = kwargs.pop("model_args") +# super(MyCallback, self).__init__(*args, **kwargs) + + +# maxcudamem = 0 +# def on_step_end(self, args, state, control, **kwargs ): +# glb_step = state.global_step +# cudamem = 0 +# realcudamem =0 +# for device_id in range(torch.cuda.device_count()): +# cudamem += torch.cuda.memory_allocated(f"cuda:{device_id}")/1024**3 +# realcudamem += torch.cuda.max_memory_allocated(f"cuda:{device_id}")/1024**3 +# torch.cuda.reset_peak_memory_stats(f"cuda:{device_id}") +# self.maxcudamem = max(self.maxcudamem, realcudamem) +# self.cudamem = cudamem +# # self.tb_writer.add_scalar("Static Memory (GB)", cudamem, glb_step) + # self.tb_writer.add_scalar("Runtime Memory (GB)", realcudamem, glb_step) + # self.tb_writer.add_scalar("Peak Memory (GB)", self.maxcudamem, glb_step) + # if glb_step > 50: + # content = f"{self.delta_args.delta_type}\t{self.trainer_args.per_device_train_batch_size}\t{self.model_args.model_name_or_path}\t{self.cudamem}\t{self.maxcudamem}\n" + # with open("memory_data.txt", 'a') as fout: + # fout.write(content) + # exit() + + + + + + + + + +def main(): + + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args, delta_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args, delta_args = parser.parse_args_into_dataclasses() + + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + print("#### last_checkpoint ", last_checkpoint) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + ''' + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + ''' + pass + elif last_checkpoint is not None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
+ ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + # Set the verbosity to info of the Transformers logger (on main process only): + if is_main_process(training_args.local_rank): + transformers.utils.logging.set_verbosity_info() + logger.info("Training/evaluation parameters %s", training_args) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files in the summarization task, this script will use the first column for the full texts and the + # second column for the summaries (unless you specify column names for this with the `text_column` and + # `summary_column` arguments). + # For translation, only JSON files are supported, with one field named "translation" containing two keys for the + # source and target languages (unless you adapt what follows). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
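# Illustrative sketch of the RemainArgHfArgumentParser.parse_json_file idea used
# above: fields belonging to the known dataclasses are popped out of the JSON
# config, and whatever remains (e.g. delta-related options) is handed back as a
# plain namespace. DemoModelArgs / DemoDataArgs below are hypothetical stand-ins,
# not the real ModelArguments / DataTrainingArguments classes.
import argparse
import dataclasses
import json

@dataclasses.dataclass
class DemoModelArgs:
    model_name_or_path: str = "t5-base"

@dataclasses.dataclass
class DemoDataArgs:
    task_name: str = "rte"

def parse_config(data, dataclass_types):
    outputs = []
    for dtype in dataclass_types:
        keys = {f.name for f in dataclasses.fields(dtype) if f.init}
        inputs = {k: data.pop(k) for k in list(data.keys()) if k in keys}
        outputs.append(dtype(**inputs))
    remaining = argparse.Namespace(**data)  # leftovers such as delta_type
    return (*outputs, remaining)

config_json = '{"model_name_or_path": "t5-base", "task_name": "rte", "delta_type": "lora"}'
model_args, data_args, delta_args = parse_config(json.loads(config_json), [DemoModelArgs, DemoDataArgs])
print(model_args.model_name_or_path, data_args.task_name, delta_args.delta_type)  # t5-base rte lora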
+ config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + config.dropout_rate = 0.0 + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + model = AutoModelForSeq2SeqLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + model.resize_token_embeddings(len(tokenizer)) + + + if delta_args.delta_type.lower() != "none": + from opendelta import AutoDeltaConfig,AutoDeltaModel + delta_config = AutoDeltaConfig.from_dict(vars(delta_args)) + delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model) + delta_model.freeze_module(set_state_dict = True) + delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True) + + + # model parallelize + # if hasattr(training_args, "model_parallel") and training_args.model_parallel: + # logger.info('parallelize model!') + model.parallelize() + + data_args.dataset_name = [data_args.task_name] + data_args.eval_dataset_name = [data_args.eval_dataset_name] + data_args.test_dataset_name = [data_args.test_dataset_name] + data_args.dataset_config_name = [data_args.dataset_config_name] + data_args.eval_dataset_config_name = [data_args.eval_dataset_config_name] + data_args.test_dataset_config_name = [data_args.test_dataset_config_name] + assert len(data_args.dataset_name) == len(data_args.dataset_config_name) + if data_args.eval_dataset_name is not None: + assert len(data_args.eval_dataset_name) == len(data_args.eval_dataset_config_name) + if data_args.test_dataset_name is not None: + assert len(data_args.test_dataset_name) == len(data_args.test_dataset_config_name) + + # Temporarily set max_target_length for training. + #max_target_length = data_args.max_target_length + padding = "max_length" if data_args.pad_to_max_length else False + + def preprocess_function(examples, max_target_length): + # max_target_length += 1 + # model_inputs = tokenizer([s+"" for s in examples['source']], max_length=data_args.max_source_length, + # padding=padding, truncation=True) + # # Setup the tokenizer for targets + # with tokenizer.as_target_tokenizer(): + # labels = tokenizer([''+t for t in examples['target']], max_length=max_target_length, padding=padding, truncation=True) + model_inputs = tokenizer([s for s in examples['source']], max_length=data_args.max_source_length, + padding=padding, truncation=True) + # Setup the tokenizer for targets + with tokenizer.as_target_tokenizer(): + labels = tokenizer([t for t in examples['target']], max_length=max_target_length, padding=padding, truncation=True) + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. 
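# A minimal sketch of the delta-attachment step performed earlier in main():
# an OpenDelta config is built from a plain dict and wrapped around the frozen
# backbone so that only the delta parameters remain trainable. The backbone
# name and the LoRA-style keys (lora_r, modified_modules) are assumptions
# chosen for illustration, not values taken from this script's configs.
from transformers import AutoModelForSeq2SeqLM
from opendelta import AutoDeltaConfig, AutoDeltaModel

backbone = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
delta_config = AutoDeltaConfig.from_dict(
    {"delta_type": "lora", "lora_r": 8, "modified_modules": ["SelfAttention.q", "SelfAttention.v"]}
)
delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=backbone)
delta_model.freeze_module(set_state_dict=True)  # keep only delta parameters trainable
delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=False)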
+ if padding == "max_length" and data_args.ignore_pad_token_for_loss: + labels["input_ids"] = [ + [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] + ] + model_inputs["labels"] = labels["input_ids"] + model_inputs["extra_fields"] = examples['extra_fields'] + return model_inputs + + column_names = ['source', 'target', 'extra_fields'] + performance_metrics = {} + if training_args.do_train: + train_datasets = [AutoTask.get(dataset_name, + dataset_config_name, + seed=data_args.data_sample_seed).get( + split="train", + split_validation_test=training_args.split_validation_test, + add_prefix=True, + n_obs=data_args.max_train_samples) + for dataset_name, dataset_config_name\ + in zip(data_args.dataset_name, data_args.dataset_config_name)] + max_target_lengths = [AutoTask.get(dataset_name, dataset_config_name).get_max_target_length(\ + tokenizer=tokenizer, default_max_length=data_args.max_target_length)\ + for dataset_name, dataset_config_name in zip(data_args.dataset_name, data_args.dataset_config_name)] + for i, train_dataset in enumerate(train_datasets): + train_datasets[i] = train_datasets[i].map( + functools.partial(preprocess_function, max_target_length=max_target_lengths[i]), + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, # if train_dataset != "superglue-record" else column_names+["answers"], + load_from_cache_file=not data_args.overwrite_cache, + ) + train_dataset = concatenate_datasets(train_datasets) + + if training_args.do_eval: + eval_datasets = {eval_dataset: AutoTask.get(eval_dataset, eval_dataset_config, + seed=data_args.data_sample_seed).get( + split="validation", + split_validation_test=training_args.split_validation_test, + add_prefix=True, + n_obs=data_args.max_val_samples) + for eval_dataset, eval_dataset_config in zip(data_args.eval_dataset_name, data_args.eval_dataset_config_name)} + max_target_lengths = [AutoTask.get(dataset_name, dataset_config_name).get_max_target_length( \ + tokenizer=tokenizer, default_max_length=data_args.max_target_length) \ + for dataset_name, dataset_config_name in zip(data_args.eval_dataset_name, data_args.eval_dataset_config_name)] + for k, name in enumerate(eval_datasets): + eval_datasets[name] = eval_datasets[name].map( + functools.partial(preprocess_function, max_target_length=max_target_lengths[k]), + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, # if name != "superglue-record" else column_names+["answers"], + load_from_cache_file=not data_args.overwrite_cache, + ) + + if training_args.do_test: + test_datasets = {test_dataset: AutoTask.get(test_dataset, test_dataset_config, + seed=data_args.data_sample_seed).get( + split="test", + split_validation_test=training_args.split_validation_test, + add_prefix=True, + n_obs=data_args.max_test_samples) + for test_dataset, test_dataset_config in zip(data_args.test_dataset_name, data_args.test_dataset_config_name)} + max_target_lengths = [AutoTask.get(dataset_name, dataset_config_name).get_max_target_length( \ + tokenizer=tokenizer, default_max_length=data_args.max_target_length) \ + for dataset_name, dataset_config_name in zip(data_args.test_dataset_name, data_args.test_dataset_config_name)] + for k, name in enumerate(test_datasets): + test_datasets[name] = test_datasets[name].map( + functools.partial(preprocess_function, max_target_length=max_target_lengths[k]), + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + 
load_from_cache_file=not data_args.overwrite_cache, + ) + + # Data collator + label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id + if data_args.pad_to_max_length: + data_collator = default_data_collator + else: + data_collator = TaskDataCollatorForSeq2Seq( + tokenizer, + label_pad_token_id=label_pad_token_id, + pad_to_multiple_of=8 if training_args.fp16 else None, + ) + + + # Metric, we assume we have only one training task. + eval_metrics = [AutoTask.get(dataset_name, dataset_config_name).metric\ + for dataset_name, dataset_config_name in zip(data_args.dataset_name, data_args.dataset_config_name)][0] + + # Extracts the extra information needed to evaluate on each dataset. + # These information are only used in the compute_metrics. + # We will assume that the test/eval dataloader does not change the order of + # the data. + data_info = {"eval": eval_datasets[data_args.eval_dataset_name[0]]['extra_fields'], + "test": test_datasets[data_args.test_dataset_name[0]]['extra_fields'], + "train": train_dataset['extra_fields']} + def compute_metrics(eval_preds): + preds, labels, data_info = eval_preds + post_processor = AutoPostProcessor.get(data_args.dataset_name[0], tokenizer, + data_args.ignore_pad_token_for_loss) + decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info) + result = {} + for metric in eval_metrics: + result.update(metric(decoded_preds, decoded_labels)) + return result + + + # Initialize our Trainer + trainer = Seq2SeqTrainer( + model=model, + args=training_args, + delta_args=delta_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=list(eval_datasets.values())[0] if training_args.do_eval else None, + data_info = data_info, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics if training_args.predict_with_generate else None, + evaluation_metrics = TASK_TO_METRICS[data_args.dataset_name[0]], + ) + + # trainer.add_callback(MyCallback(trainer_args=training_args, delta_args=delta_args, model_args=model_args)) + + + # Saves training config. 
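# Sketch of how compute_metrics above assembles its result: every metric in
# eval_metrics maps (decoded_preds, decoded_labels) to a small dict, and the
# dicts are merged into one result. Toy metric functions stand in for the real
# metrics module and AutoPostProcessor here; `toy_length_diff` is made up.
import numpy as np

def toy_accuracy(predictions, targets):
    return {"accuracy": 100 * (np.array(predictions) == np.array(targets)).mean()}

def toy_length_diff(predictions, targets):
    return {"length_diff": float(np.mean([abs(len(p) - len(t)) for p, t in zip(predictions, targets)]))}

eval_metrics = [toy_accuracy, toy_length_diff]
decoded_preds, decoded_labels = ["1", "0", "1"], ["1", "1", "1"]
result = {}
for metric in eval_metrics:
    result.update(metric(decoded_preds, decoded_labels))
print(result)  # {'accuracy': 66.66..., 'length_diff': 0.0}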
+ if trainer.is_world_process_zero(): + os.makedirs(training_args.output_dir, exist_ok=True) + save_training_config(sys.argv[1], training_args.output_dir) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + + if training_args.compute_time: + torch.cuda.synchronize() # wait for move to complete + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + + train_result = trainer.train(resume_from_checkpoint=checkpoint) + + if training_args.compute_time: + end.record() + torch.cuda.synchronize() # wait for all_reduce to complete + total_time = start.elapsed_time(end)/(1000*60) + performance_metrics.update({"total_time in minutes ": total_time}) + + trainer.save_model() # Saves the tokenizer too for easy upload + train_metrics = train_result.metrics + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + train_metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + trainer.log_metrics("train", train_metrics) + trainer.save_metrics("train", train_metrics) + trainer.save_state() + + if torch.cuda.is_available() and training_args.compute_memory: + peak_memory = (torch.cuda.max_memory_allocated() / 1024 ** 2)/1000 + print( + "Memory utilization", + peak_memory, + "GB" + ) + performance_metrics.update({"peak_memory": peak_memory}) + if training_args.compute_memory or training_args.compute_time: + print(performance_metrics) + trainer.save_metrics("performance", performance_metrics) + + # Evaluation + results = {} + if training_args.do_eval: + logger.info("*** Evaluate ***") + for task, eval_dataset in eval_datasets.items(): + metrics = trainer.evaluate(eval_dataset=eval_dataset, + max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, + ) + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + results['evaluate'] = metrics + + # Test + if training_args.do_test: + logger.info("*** Test ***") + for task, test_dataset in test_datasets.items(): + metrics = trainer.evaluate(eval_dataset=test_dataset, + max_length=data_args.test_max_target_length, num_beams=data_args.num_beams, + metric_key_prefix="test" + ) + trainer.log_metrics("test", metrics) + trainer.save_metrics("test", metrics) + results['test'] = metrics + + repo_name = create_hub_repo_name(root="DeltaHub", + dataset=data_args.task_name, + delta_type = delta_args.delta_type, + model_name_or_path= model_args.model_name_or_path) + results['repo_name'] = repo_name + if training_args.push_to_hub: # TODO add description here + delta_model.save_finetuned(push_to_hub=True, save_directory=repo_name, use_auth_token=True) + # trainer.push_to_hub(**kwargs) + else: + delta_model.save_finetuned(push_to_hub=False, save_directory=repo_name, use_auth_token=True) + + return results + + + + +if __name__ == "__main__": + result = main() + import json + with open("collect_result.jsonl", 'a') as fout: + string = json.dumps(result, indent=4,sort_keys=True) + fout.write(string+"\n") + print(result) diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/seq2seq_trainer.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/seq2seq_trainer.py new file mode 100644 index 0000000..e557844 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/seq2seq_trainer.py @@ -0,0 +1,114 @@ +from 
packaging import version +import torch +from torch import nn +from typing import Any, Dict, List, Optional, Tuple, Union + +from torch.utils.data.dataset import Dataset +from transformers import Seq2SeqTrainer as HfSeq2SeqTrainner +from examples_seq2seq.trainers.trainer import BaseTrainer + + +from transformers.optimization import Adafactor, AdamW, get_scheduler +from transformers.trainer_pt_utils import get_parameter_names, is_sagemaker_mp_enabled +from transformers.integrations import is_fairscale_available + + + +if version.parse(torch.__version__) >= version.parse("1.6"): + from torch.cuda.amp import autocast + + +class Seq2SeqTrainer(HfSeq2SeqTrainner, BaseTrainer): + def __init__(self, train_dataset_sizes=None, delta_args=None, *args, **kwargs): + super().__init__(*args, **kwargs) + self.train_dataset_sizes = train_dataset_sizes + self.delta_args = delta_args + + def evaluate( + self, + eval_dataset: Optional[Dict[str, Dataset]] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + max_length: Optional[int] = None, + num_beams: Optional[int] = None, + ) -> Dict[str, float]: + # TODO: this also needs to be set per dataset + self._max_length = max_length + self._num_beams = num_beams + return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) + + + def prediction_step( + self, + model: nn.Module, + inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). 
+ """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + gen_kwargs = { + "max_length": self._max_length if self._max_length is not None else self.model.config.max_length, + "num_beams": self._num_beams if self._num_beams is not None else self.model.config.num_beams, + } + generated_tokens = self.model.generate( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + **gen_kwargs, + ) + # in case the batch is shorter than max length, the output should be padded + if generated_tokens.shape[-1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + + with torch.no_grad(): + if self.use_amp: + with autocast(): + outputs = model(**inputs) + else: + outputs = model(**inputs) + if has_labels: + if self.label_smoother is not None: + loss = self.label_smoother(outputs, inputs["labels"]).mean().detach() + else: + loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach() + else: + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) + + labels = inputs["labels"] + if labels.shape[-1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + + return (loss, generated_tokens, labels) + + + + + diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/__init__.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/__init__.py new file mode 100644 index 0000000..8a0a403 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/__init__.py @@ -0,0 +1,2 @@ +from .trainer import BaseTrainer +from .seq2seq_trainer import Seq2SeqTrainer diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/model_args.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/model_args.py new file mode 100644 index 0000000..35e7785 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/model_args.py @@ -0,0 +1,36 @@ +from dataclasses import dataclass, field +from typing import Optional, List + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." 
+ }, + ) \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/seq2seq_trainer.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/seq2seq_trainer.py new file mode 100644 index 0000000..d6a2b80 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/seq2seq_trainer.py @@ -0,0 +1,108 @@ +from packaging import version +import torch +from torch import nn +from typing import Any, Dict, List, Optional, Tuple, Union + +from torch.utils.data.dataset import Dataset +from transformers import Seq2SeqTrainer +from .trainer import BaseTrainer + + +if version.parse(torch.__version__) >= version.parse("1.6"): + from torch.cuda.amp import autocast + + +class Seq2SeqTrainer(Seq2SeqTrainer, BaseTrainer): + def __init__(self, train_dataset_sizes=None, delta_args=None, *args, **kwargs): + super().__init__(*args, **kwargs) + self.train_dataset_sizes = train_dataset_sizes + self.delta_args = delta_args + + def evaluate( + self, + eval_dataset: Optional[Dict[str, Dataset]] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + max_length: Optional[int] = None, + num_beams: Optional[int] = None, + ) -> Dict[str, float]: + # TODO: this also needs to be set per dataset + self._max_length = max_length + self._num_beams = num_beams + return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) + + + def prediction_step( + self, + model: nn.Module, + inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). 
+ """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + gen_kwargs = { + "max_length": self._max_length if self._max_length is not None else self.model.config.max_length, + "num_beams": self._num_beams if self._num_beams is not None else self.model.config.num_beams, + } + generated_tokens = self.model.generate( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + **gen_kwargs, + ) + # in case the batch is shorter than max length, the output should be padded + if generated_tokens.shape[-1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + + with torch.no_grad(): + if self.use_amp: + with autocast(): + outputs = model(**inputs) + else: + outputs = model(**inputs) + if has_labels: + if self.label_smoother is not None: + loss = self.label_smoother(outputs, inputs["labels"]).mean().detach() + else: + loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach() + else: + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) + + labels = inputs["labels"] + if labels.shape[-1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + + return (loss, generated_tokens, labels) + + + + + diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/trainer.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/trainer.py new file mode 100644 index 0000000..304e32b --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/trainer.py @@ -0,0 +1,274 @@ +from typing import Dict, List, Optional +import numpy as np +import time +import torch +import collections +from packaging import version +from torch.utils.data.dataset import Dataset + +from transformers import Trainer +from transformers import logging +from transformers.trainer_utils import ( + speed_metrics, + EvalLoopOutput, + denumpify_detensorize +) +from transformers.file_utils import is_torch_tpu_available +from transformers.trainer_pt_utils import ( + find_batch_size, + nested_numpify, + nested_truncate, + nested_concat, + IterableDatasetShard +) +from .trainer_utils import EvalPrediction + + +from torch.utils.data.dataloader import DataLoader +from torch.utils.data.dataset import IterableDataset +from transformers.deepspeed import deepspeed_init + + +if version.parse(torch.__version__) >= version.parse("1.6"): + from torch.cuda.amp import autocast + +if is_torch_tpu_available(): + import torch_xla.core.xla_model as xm + import torch_xla.debug.metrics as met + import torch_xla.distributed.parallel_loader as pl + +logger = logging.get_logger(__name__) + +class BaseTrainer(Trainer): + def __init__(self, evaluation_metrics=[], data_info=None, *args, **kwargs): + """When doing evaluation, it computes average of list of metrics + given in evaluation_metrics and adds it to the dictionary of results. 
+ Trainer class then use this average metric to save the best model.""" + super().__init__(*args, **kwargs) + self.evaluation_metrics = evaluation_metrics + self.data_info = data_info + + def get_data_info(self, metric_key_prefix): + """Returns the data information required to make the predictions/labels + suitable for the evaluation.""" + if self.data_info is not None: + return self.data_info[metric_key_prefix] + return None + + def evaluate( + self, + eval_dataset: Optional[Dataset] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + ) -> Dict[str, float]: + """ + Run evaluation and returns metrics. + The calling script will be responsible for providing a method to compute metrics, as they are task-dependent + (pass it to the init :obj:`compute_metrics` argument). + You can also subclass and override this method to inject custom behavior. + Args: + eval_dataset (:obj:`Dataset`, `optional`): + Pass a dataset if you wish to override :obj:`self.eval_dataset`. If it is an :obj:`datasets.Dataset`, + columns not accepted by the ``model.forward()`` method are automatically removed. It must implement the + :obj:`__len__` method. + ignore_keys (:obj:`Lst[str]`, `optional`): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (:obj:`str`, `optional`, defaults to :obj:`"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is "eval" (default) + Returns: + A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The + dictionary also contains the epoch number which comes from the training state. + """ + # memory metrics - must set up as early as possible + self._memory_tracker.start() + eval_dataloader = self.get_eval_dataloader(eval_dataset) + start_time = time.time() + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + output = eval_loop( + eval_dataloader, + description="Evaluation", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if self.compute_metrics is None else None, + ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix, + ) + output.metrics.update(speed_metrics(metric_key_prefix, start_time, output.num_samples)) + if len(self.evaluation_metrics) != 0: + selected_metrics = [output.metrics[metric_key_prefix+"_"+k] for k in self.evaluation_metrics if metric_key_prefix+"_"+k in output.metrics] + assert len(selected_metrics) >= 1, "at least one metric should be selected to compute the average_metrics." + output.metrics.update({metric_key_prefix+'_average_metrics': np.mean(selected_metrics)}) + + self.log(output.metrics) + + if self.args.tpu_metrics_debug or self.args.debug: + # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) 
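# Sketch of the average_metrics bookkeeping done in evaluate() above: the
# metrics named in evaluation_metrics are averaged into one extra entry that
# can drive best-model selection. The metric values below are made up.
import numpy as np

metric_key_prefix = "eval"
evaluation_metrics = ["accuracy", "f1"]
metrics = {"eval_accuracy": 70.0, "eval_f1": 80.0, "eval_loss": 0.42}

selected = [metrics[metric_key_prefix + "_" + k] for k in evaluation_metrics
            if metric_key_prefix + "_" + k in metrics]
assert len(selected) >= 1, "at least one metric should be selected"
metrics[metric_key_prefix + "_average_metrics"] = float(np.mean(selected))
print(metrics["eval_average_metrics"])  # 75.0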
+ xm.master_print(met.metrics_report()) + + self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics) + self._memory_tracker.stop_and_update_metrics(output.metrics) + return output.metrics + + def evaluation_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + ) -> EvalLoopOutput: + """ + Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`. + + Works both with or without labels. + """ + prediction_loss_only = ( + prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only + ) + + # if eval is called w/o train init deepspeed here + if self.args.deepspeed and not self.deepspeed: + + # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval + # from the checkpoint eventually + deepspeed_engine, _, _ = deepspeed_init(self, num_training_steps=0, resume_from_checkpoint=None) + self.model = deepspeed_engine.module + self.model_wrapped = deepspeed_engine + self.deepspeed = deepspeed_engine + # XXX: we don't need optim/sched for inference, but this needs to be sorted out, since + # for example the Z3-optimizer is a must for zero3 to work even for inference - what we + # don't need is the deepspeed basic optimizer which is self.optimizer.optimizer + deepspeed_engine.optimizer.optimizer = None + deepspeed_engine.lr_scheduler = None + + model = self._wrap_model(self.model, training=False) + + # if full fp16 is wanted on eval and this ``evaluation`` or ``predict`` isn't called while + # ``train`` is running, halve it first and then put on device + if not self.is_in_train and self.args.fp16_full_eval: + model = model.half().to(self.args.device) + + batch_size = dataloader.batch_size + + logger.info(f"***** Running {description} *****") + if isinstance(dataloader.dataset, collections.abc.Sized): + logger.info(f" Num examples = {self.num_examples(dataloader)}") + else: + logger.info(" Num examples: Unknown") + logger.info(f" Batch size = {batch_size}") + + model.eval() + + self.callback_handler.eval_dataloader = dataloader + # Do this before wrapping. + eval_dataset = dataloader.dataset + + if is_torch_tpu_available(): + dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device) + + if self.args.past_index >= 0: + self._past = None + + # Initialize containers + # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps) + losses_host = None + preds_host = None + labels_host = None + # losses/preds/labels on CPU (final containers) + all_losses = None + all_preds = None + all_labels = None + # Will be useful when we have an iterable dataset so don't know its length. 
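# Sketch of the accumulate-then-offload pattern the containers above are used
# for: per-batch tensors are concatenated on the device and, every
# eval_accumulation_steps batches, moved to numpy on the CPU so device memory
# stays bounded. Toy tensors replace the Trainer's gather/concat helpers here.
import numpy as np
import torch

eval_accumulation_steps = 2
preds_host, all_preds = None, None
for step, batch_logits in enumerate(torch.arange(12.0).reshape(6, 2)):
    batch_logits = batch_logits.unsqueeze(0)
    preds_host = batch_logits if preds_host is None else torch.cat((preds_host, batch_logits), dim=0)
    if (step + 1) % eval_accumulation_steps == 0:
        logits = preds_host.cpu().numpy()
        all_preds = logits if all_preds is None else np.concatenate((all_preds, logits), axis=0)
        preds_host = None  # start a new accumulation window
print(all_preds.shape)  # (6, 2)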
+ + observed_num_examples = 0 + # Main evaluation loop + for step, inputs in enumerate(dataloader): + # Update the observed num examples + observed_batch_size = find_batch_size(inputs) + if observed_batch_size is not None: + observed_num_examples += observed_batch_size + + # Prediction step + loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) + # Update containers on host + if loss is not None: + losses = self._nested_gather(loss.repeat(batch_size)) + losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) + if logits is not None: + logits = self._pad_across_processes(logits) + logits = self._nested_gather(logits) + preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) + if labels is not None: + labels = self._pad_across_processes(labels) + labels = self._nested_gather(labels) + labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + self.control = self.callback_handler.on_prediction_step(self.args, self.state, self.control) + + # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. + if self.args.eval_accumulation_steps is not None and (step + 1) % self.args.eval_accumulation_steps == 0: + if losses_host is not None: + losses = nested_numpify(losses_host) + all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) + if preds_host is not None: + logits = nested_numpify(preds_host) + all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + if labels_host is not None: + labels = nested_numpify(labels_host) + all_labels = ( + labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + ) + + # Set back to None to begin a new accumulation + losses_host, preds_host, labels_host = None, None, None + + if self.args.past_index and hasattr(self, "_past"): + # Clean the state at the end of the evaluation loop + delattr(self, "_past") + + # Gather all remaining tensors and put them back on the CPU + if losses_host is not None: + losses = nested_numpify(losses_host) + all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) + if preds_host is not None: + logits = nested_numpify(preds_host) + all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + if labels_host is not None: + labels = nested_numpify(labels_host) + all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + + # Number of samples + if not isinstance(eval_dataset, IterableDataset): + num_samples = len(eval_dataset) + elif isinstance(eval_dataset, IterableDatasetShard): + num_samples = eval_dataset.num_examples + else: + num_samples = observed_num_examples + + # Number of losses has been rounded to a multiple of batch_size and in a distributed training, the number of + # samplers has been rounded to a multiple of batch_size, so we truncate. + if all_losses is not None: + all_losses = all_losses[:num_samples] + if all_preds is not None: + all_preds = nested_truncate(all_preds, num_samples) + if all_labels is not None: + all_labels = nested_truncate(all_labels, num_samples) + # Metrics! 
+ if self.compute_metrics is not None and all_preds is not None and all_labels is not None: + metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels, + data_info=self.get_data_info(metric_key_prefix))) + else: + metrics = {} + + # To be JSON-serializable, we need to remove numpy types or zero-d tensors + metrics = denumpify_detensorize(metrics) + + if all_losses is not None: + metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item() + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples) diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/trainer_args.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/trainer_args.py new file mode 100644 index 0000000..7da768d --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/trainer_args.py @@ -0,0 +1,141 @@ +from dataclasses import dataclass, field +from typing import Optional, List +from transformers import Seq2SeqTrainingArguments +# run_seq2seq parameters. + +@dataclass +class TrainingArguments(Seq2SeqTrainingArguments): + print_num_parameters: Optional[bool] = field(default=False, metadata={"help": "If set, print the parameters of " + "the model."}) + do_test: Optional[bool] = field(default=False, metadata={"help": "If set, evaluates the test performance."}) + split_validation_test: Optional[bool] = field(default=False, + metadata={"help": "If set, for the datasets which do not" + "have the test set, we use validation set as their" + "test set and make a validation set from either" + "splitting the validation set into half (for smaller" + "than 10K samples datasets), or by using 1K examples" + "from training set as validation set (for larger" + " datasets)."}) + compute_time: Optional[bool] = field(default=False, metadata={"help": "If set measures the time."}) + compute_memory: Optional[bool] = field(default=False, metadata={"help": "if set, measures the memory"}) + # prefix_length: Optional[int] = field(default=100, metadata={"help": "Defines the length for prefix tuning."}) + + + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. 
+ """ + task_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + eval_dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the evaluation dataset to use (via the datasets library)."} + ) + eval_dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the evaluation dataset to use (via the datasets library)."} + ) + test_dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the test dataset to use (via the datasets library)."} + ) + test_dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the test dataset to use (via the datasets library)."} + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + max_source_length: Optional[int] = field( + default=128, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + max_target_length: Optional[int] = field( + default=128, + metadata={ + "help": "The maximum total sequence length for target text after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + val_max_target_length: Optional[int] = field( + default=None, + metadata={ + "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`." + "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " + "during ``evaluate`` and ``predict``." + }, + ) + test_max_target_length: Optional[int] = field( + default=None, + metadata={ + "help": "The maximum total sequence length for test target text after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`." + "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " + "during ``evaluate`` and ``predict``." + }, + ) + pad_to_max_length: bool = field( + default=False, + metadata={ + "help": "Whether to pad all samples to model maximum sentence length. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " + "efficient on GPU but very bad for TPU." + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_val_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of validation examples to this " + "value if set." 
+        },
+    )
+    max_test_samples: Optional[int] = field(
+        default=None,
+        metadata={"help": "For debugging purposes or quicker training, truncate the number of test examples to this "
+                  "value if set."}
+    )
+    num_beams: Optional[int] = field(default=None, metadata={"help": "Number of beams to use for evaluation."})
+    ignore_pad_token_for_loss: bool = field(
+        default=True,
+        metadata={
+            "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
+        },
+    )
+    task_adapters: Optional[List[str]] = field(
+        default=None,
+        metadata={"help": "Defines a dictionary from task adapters to the tasks."}
+    )
+    task_embeddings: Optional[List[str]] = field(
+        default=None,
+        metadata={"help": "Defines a dictionary from tasks to the task embeddings."}
+    )
+    data_sample_seed: Optional[int] = field(default=42, metadata={"help": "seed used to shuffle the data."})
+
+
+    model_parallel: Optional[bool] = field(default=False, metadata={"help": "whether to apply model parallelization"})
+
+    def __post_init__(self):
+        if self.task_name is None:
+            raise ValueError("Need either a dataset name or a training/validation file.")
+        if self.val_max_target_length is None:
+            self.val_max_target_length = self.max_target_length
+        if self.test_max_target_length is None:
+            self.test_max_target_length = self.max_target_length
diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/trainer_utils.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/trainer_utils.py
new file mode 100644
index 0000000..3b4b917
--- /dev/null
+++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/trainers/trainer_utils.py
@@ -0,0 +1,75 @@
+import numpy as np
+from typing import Union, NamedTuple, Tuple, Dict, Any
+import os
+import regex as re
+import logging
+from dataclasses import fields
+import torch.nn as nn
+import json
+
+
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+class EvalPrediction(NamedTuple):
+    """
+    Evaluation output (always contains labels), to be used to compute metrics.
+    Parameters:
+        predictions (:obj:`np.ndarray`): Predictions of the model.
+        label_ids (:obj:`np.ndarray`): Targets to be matched.
+        data_info (:obj:`Dict[str, Any]`): Extra dataset information that one requires
+            to perform the evaluation. The data_info is a dictionary with keys from
+            train, eval, test to specify the data_info for each split of the dataset.
+    """
+    predictions: Union[np.ndarray, Tuple[np.ndarray]]
+    label_ids: np.ndarray
+    data_info: Dict[str, Any]
+
+
+
+
+
+def create_dir(output_dir):
+    """
+    Checks whether the output_dir already exists and creates it if not.
+    Args:
+        output_dir: path to the output_dir
+    """
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+
+def get_last_checkpoint(output_dir):
+    if os.path.exists(os.path.join(output_dir, 'pytorch_model.bin')):
+        return output_dir
+    return None
+
+
+def pad_punctuation(text):
+    """Re-implementation of _pad_punctuation in t5. This function adds spaces
+    around punctuation. While this pads punctuation as expected, it has the
+    unexpected effect of also padding certain unicode characters with accents
+    with spaces. For instance: "François" becomes "Fran ç ois"."""
+    # Pad everything except for: underscores (_), whitespace (\s),
+    # numbers (\p{N}), letters (\p{L}) and accent characters (\p{M}).
+    text = re.sub(r'([^_\s\p{N}\p{L}\p{M}])', r' \1 ', text)
+    # Collapse consecutive whitespace into one space.
+    text = re.sub(r'\s+', ' ', text)
+    return text
+
+def save_json(filepath, dictionary):
+    with open(filepath, "w") as outfile:
+        json.dump(dictionary, outfile)
+
+
+def read_json(filepath):
+    with open(filepath) as f:
+        return json.load(f)
+
+
+def save_training_config(config_file, output_dir):
+    json_data = read_json(config_file)
+    save_json(os.path.join(output_dir, "training_config.json"), json_data)
+
diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/utils/__init__.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/utils/utils.py b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/utils/utils.py
new file mode 100644
index 0000000..74e3528
--- /dev/null
+++ b/OpenDelta-0.3.2/examples/legacies/examples_seq2seq/utils/utils.py
@@ -0,0 +1,15 @@
+import os
+import regex as re
+import logging
+from dataclasses import fields
+import torch.nn as nn
+import json
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+
+
+
+
diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/README.md b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/README.md
new file mode 100644
index 0000000..0d16da5
--- /dev/null
+++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/README.md
@@ -0,0 +1,58 @@
+# Text classification with OpenDelta
+This repository contains examples that use OpenDelta for text classification in the traditional classification mode, i.e., with a classification head on top of the language model. Almost all of the training pipeline code remains the same, except for some minimal changes to insert delta models into the backbone model.
+
+
+## Generating the json configuration file
+
+```
+python config_gen.py --job $job_name
+
+```
+The available job configurations (e.g., `--job lora_roberta-base`) can be seen in `config_gen.py`. You can also
+create your own configuration.
+
+
+## Run the code
+
+```
+python run_glue.py configs/$job_name/$dataset.json
+```
+
+
+## Possible Errors
+
+1.
+```
+ValueError: You must login to the Hugging Face hub on this computer by typing `transformers-cli login` and entering your credentials to use `use_auth_token=Tr
+ue`. Alternatively, you can pass your own token as the `use_auth_token` argument.
+```
+- Solution 1: Register an account on [HuggingFace](https://huggingface.co/),
+then run `transformers-cli login` on your command line and enter your username and password.
+
+- Solution 2: Disable pushing to the hub by setting `"push_to_hub": false` in the config JSON.
+
+2.
+```
+OSError: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).
+```
+
+- Solution 1:
+```
+wget -P ~ https://github.com/git-lfs/git-lfs/releases/download/v3.0.2/git-lfs-linux-amd64-v3.0.2.tar.gz
+cd ~
+tar -xvzf git-lfs-linux-amd64-v3.0.2.tar.gz
+export PATH=~:$PATH # a temporary fix; to add it permanently, modify your bash profile
+git-lfs install
+```
+
+- Solution 2: Disable pushing to the hub by setting `"push_to_hub": false` in the config JSON.
+
+3. Dataset connection error
+
+Solution 1: open a Python console and run the failing command again (this does not always help).
+
+Solution 2: download the dataset yourself on an Internet-connected machine, save it to disk, transfer it to your server, and finally load it with `load_from_disk` (see the sketch below).
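+
+A minimal sketch of that offline workflow with the HuggingFace `datasets` library (the dataset name and paths below are placeholders, not part of this repo):
+
+```
+# On a machine with Internet access: download and save the dataset to a folder.
+from datasets import load_dataset
+raw = load_dataset("glue", "mrpc")        # placeholder dataset/config
+raw.save_to_disk("./glue_mrpc")           # copy this folder to the offline server
+
+# On the offline server: load the saved folder directly from disk.
+from datasets import load_from_disk
+raw = load_from_disk("/path/on/server/glue_mrpc")
+```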
+ + +## Link to the original training scripts +This example repo is based on the [huggingface text-classification example](https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-classification). Thanks to the authors of the original repo. diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/config_gen.py b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/config_gen.py new file mode 100644 index 0000000..096dddb --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/config_gen.py @@ -0,0 +1,363 @@ +import collections +import copy + +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['roberta-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "metric_for_best_model"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ["eval_accuracy"] *15, + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "roberta-base", + "tokenizer_name": "roberta-base", + "save_total_limit": 1, + # For glue datasets. + # "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
+ "load_best_model_at_end": True, + # "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + + +BaseConfigs['deberta-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "metric_for_best_model"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ["eval_accuracy"] *15, + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "microsoft/deberta-v3-base", + "tokenizer_name": "microsoft/deberta-v3-base", + "save_total_limit": 1, + # For glue datasets. + # "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
+ "load_best_model_at_end": True, + # "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +BaseConfigs['deberta-v2-xlarge'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "metric_for_best_model", "gradient_accumulation_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 16, 16, 16, 16, 16, 8, 16] + [16] * 8, + [ 16, 16, 16, 16, 16, 8, 16] + [16] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ["eval_accuracy"] *15, + [4] *15, + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "microsoft/deberta-v2-xlarge", + "tokenizer_name": "microsoft/deberta-v2-xlarge", + "save_total_limit": 1, + # For glue datasets. + # "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
+ "load_best_model_at_end": True, + # "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + + +AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['bitfit_roberta-base'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/roberta-base/", + "unfrozen_modules": [ + "classifier", + "deltas" + ], + }) + +AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['adapter_roberta-base'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier", + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/roberta-base/", + }) + +AllConfigs['parallel_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['parallel_adapter_roberta-base'].update({ + "delta_type": "parallel_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier", + ], + "bottleneck_dim":24, + "output_dir": "outputs/parallel_adapter/roberta-base/", + }) + +AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['lora_roberta-base'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "common_structure": False, + "modified_modules": ['attention.query'], + # "unfrozen_modules": [ + # "deltas", + # "layer_norm", + # "final_layer_norm", + # "classifier", + # ], + "unfrozen_modules": [ + "deltas", + "LayerNorm", + "classifier", + ], + "lora_r": 8, + "output_dir": "outputs/lora/roberta-base/", + }) + +AllConfigs['compacter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['compacter_roberta-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier", + ], + "output_dir": "outputs/compacter/roberta-base/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['compacter++_roberta-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier", + ], + "output_dir": "outputs/compacter++/roberta-base/", + "non_linearity": "gelu_new", + + #Compacter. 
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['low_rank_adapter_roberta-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier", + ], + "output_dir": "outputs/low_rank_adapter/roberta-base/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + + +AllConfigs['soft_prompt_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['soft_prompt_roberta-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "unfrozen_modules": [ + "deltas", + "classifier", + ], + "output_dir": "outputs/soft_prompt/roberta-base/", + }) + +AllConfigs['prefix_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['prefix_roberta-base'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "classifier", + ], + "output_dir": "outputs/prefix/roberta-base/", + }) + +AllConfigs['soft_prompt_deberta-v2-xlarge'] = copy.deepcopy(BaseConfigs['deberta-v2-xlarge']) +AllConfigs['soft_prompt_deberta-v2-xlarge'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "unfrozen_modules": [ + "deltas", + "classifier", + ], + "output_dir": "outputs/soft_prompt/deberta-v2-xlarge/", + }) + + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"./{args.job}/"): + os.mkdir(f"./{args.job}/") + + for job_name in all_config_jsons: + with open(f"./{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + + \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_cola.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_cola.json new file mode 100644 index 0000000..aa05f0a --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_cola.json @@ -0,0 +1,47 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "lora", + "do_eval": 
true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "cola", + "evaluation_strategy": "epoch", + "greater_is_better": true, + "metric_for_best_model": "eval_matthews_correlation", + "learning_rate": 0.0004, + "load_best_model_at_end": true, + "lora_alpha": 8, + "lora_rank": 8, + "max_source_length": 512, + "model_name": "roberta", + "model_name_or_path": "roberta-base", + "non_linearity": "gelu_new", + "num_train_epochs": 80, + "output_dir": "outputs/lora/roberta-base/v2/cola", + "per_device_eval_batch_size": 100, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "save_strategy": "epoch", + "save_total_limit": 1, + "split_validation_test": true, + "task_name": "cola", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "cola", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "classifier", + "deltas" + ], + "warmup_ratio": 0.06, + "warmup_steps": 0, + "weight_decay": 0.1, + "overwrite_output_dir": true, + "push_to_hub": false +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_mnli.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_mnli.json new file mode 100644 index 0000000..06d4428 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_mnli.json @@ -0,0 +1,46 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_lr": 0.0005, + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mnli", + "evaluation_strategy": "epoch", + "greater_is_better": true, + "metric_for_best_model": "eval_accuracy", + "learning_rate": 0.0005, + "load_best_model_at_end": true, + "lora_alpha": 8, + "lora_rank": 8, + "max_source_length": 512, + "model_name": "roberta", + "model_name_or_path": "roberta-base", + "non_linearity": "gelu_new", + "num_train_epochs": 30, + "output_dir": "outputs/lora/roberta-base/v2/mnli", + "per_device_eval_batch_size": 100, + "per_device_train_batch_size": 16, + "save_strategy": "epoch", + "save_total_limit": 1, + "split_validation_test": true, + "task_name": "mnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mnli", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "classifier", + "deltas" + ], + "warmup_ratio": 0.06, + "weight_decay": 0.1, + "overwrite_output_dir": true, + "push_to_hub": false +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_mrpc.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_mrpc.json new file mode 100644 index 0000000..10eeb38 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_mrpc.json @@ -0,0 +1,48 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_lr": 0.0004, + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mrpc", + "evaluation_strategy": "epoch", + "greater_is_better": true, + "metric_for_best_model": "eval_accuracy", + "learning_rate": 0.0004, + "load_best_model_at_end": true, + "lora_alpha": 8, + "lora_rank": 8, + "max_source_length": 512, + "model_name": "roberta", + "model_name_or_path": "roberta-base", + "non_linearity": 
"gelu_new", + "num_train_epochs": 30, + "output_dir": "outputs/lora/roberta-base/v2/mrpc", + "per_device_eval_batch_size": 100, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "save_strategy": "epoch", + "save_total_limit": 1, + "split_validation_test": true, + "task_name": "mrpc", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mrpc", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "classifier", + "deltas", + "layer_norm" + ], + "warmup_ratio": 0.06, + "weight_decay": 0.1, + "overwrite_output_dir": true, + "push_to_hub": false +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_qnli.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_qnli.json new file mode 100644 index 0000000..05d28ce --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_qnli.json @@ -0,0 +1,47 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_lr": 0.0004, + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qnli", + "evaluation_strategy": "epoch", + "greater_is_better": true, + "metric_for_best_model": "eval_accuracy", + "learning_rate": 0.0004, + "load_best_model_at_end": true, + "lora_alpha": 8, + "lora_rank": 8, + "max_source_length": 512, + "model_name": "roberta", + "model_name_or_path": "roberta-base", + "non_linearity": "gelu_new", + "num_train_epochs": 25, + "output_dir": "outputs/lora/roberta-base/v2/qnli", + "per_device_eval_batch_size": 100, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "save_strategy": "epoch", + "save_total_limit": 1, + "split_validation_test": true, + "task_name": "qnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qnli", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "classifier", + "deltas" + ], + "warmup_ratio": 0.06, + "weight_decay": 0.1, + "overwrite_output_dir": true, + "push_to_hub": false +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_qqp.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_qqp.json new file mode 100644 index 0000000..0ca93ec --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_qqp.json @@ -0,0 +1,47 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_lr": 0.0005, + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qqp", + "evaluation_strategy": "epoch", + "greater_is_better": true, + "metric_for_best_model": "eval_accuracy", + "learning_rate": 0.0005, + "load_best_model_at_end": true, + "lora_alpha": 8, + "lora_rank": 8, + "max_source_length": 512, + "model_name": "roberta", + "model_name_or_path": "roberta-base", + "non_linearity": "gelu_new", + "num_train_epochs": 25, + "output_dir": "outputs/lora/roberta-base/v2/qqp", + "per_device_eval_batch_size": 100, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "save_strategy": "epoch", + "save_total_limit": 1, + "split_validation_test": true, + "task_name": "qqp", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qqp", + "tokenizer_name": "roberta-base", + 
"unfrozen_modules": [ + "classifier", + "deltas" + ], + "warmup_ratio": 0.06, + "weight_decay": 0.1, + "overwrite_output_dir": true, + "push_to_hub": false +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_rte.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_rte.json new file mode 100644 index 0000000..20f98d0 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_rte.json @@ -0,0 +1,46 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "evaluation_strategy": "epoch", + "greater_is_better": true, + "metric_for_best_model": "eval_accuracy", + "learning_rate": 0.0005, + "load_best_model_at_end": true, + "lora_alpha": 8, + "lora_rank": 8, + "max_source_length": 512, + "model_name": "roberta", + "model_name_or_path": "roberta-base", + "non_linearity": "gelu_new", + "num_train_epochs": 80, + "output_dir": "outputs/lora/roberta-base/rte", + "per_device_eval_batch_size": 100, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "save_strategy": "epoch", + "save_total_limit": 1, + "split_validation_test": true, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "classifier", + "deltas" + ], + "warmup_ratio": 0.06, + "weight_decay": 0.1, + "overwrite_output_dir": true, + "push_to_hub": false +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_sst2.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_sst2.json new file mode 100644 index 0000000..767d501 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_sst2.json @@ -0,0 +1,47 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_lr": 0.0005, + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "sst2", + "evaluation_strategy": "epoch", + "metric_for_best_model": "eval_accuracy", + "greater_is_better": true, + "learning_rate": 0.0005, + "load_best_model_at_end": true, + "lora_alpha": 8, + "lora_rank": 8, + "max_source_length": 512, + "model_name": "roberta", + "model_name_or_path": "roberta-base", + "non_linearity": "gelu_new", + "num_train_epochs": 60, + "output_dir": "outputs/lora/roberta-base/v2/sst2", + "per_device_eval_batch_size": 100, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "save_strategy": "epoch", + "save_total_limit": 1, + "split_validation_test": true, + "task_name": "sst2", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "sst2", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "classifier", + "deltas" + ], + "warmup_ratio": 0.06, + "weight_decay": 0.1, + "overwrite_output_dir": true, + "push_to_hub": false +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_stsb.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_stsb.json new file mode 100644 index 0000000..827b139 --- /dev/null +++ 
b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_stsb.json @@ -0,0 +1,47 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_lr": 0.0004, + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "stsb", + "evaluation_strategy": "epoch", + "greater_is_better": true, + "metric_for_best_model": "eval_pearson", + "learning_rate": 0.0004, + "load_best_model_at_end": true, + "lora_alpha": 8, + "lora_rank": 8, + "max_source_length": 512, + "model_name": "roberta", + "model_name_or_path": "roberta-base", + "non_linearity": "gelu_new", + "num_train_epochs": 40, + "output_dir": "outputs/lora/roberta-base/v2/stsb", + "per_device_eval_batch_size": 100, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "save_strategy": "epoch", + "save_total_limit": 1, + "split_validation_test": true, + "task_name": "stsb", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "stsb", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "classifier", + "deltas" + ], + "warmup_ratio": 0.06, + "weight_decay": 0.1, + "overwrite_output_dir": true, + "push_to_hub": false +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_wnli.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_wnli.json new file mode 100644 index 0000000..941cddb --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_wnli.json @@ -0,0 +1,48 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_lr": 0.0005, + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "wnli", + "evaluation_strategy": "epoch", + "greater_is_better": true, + "metric_for_best_model": "eval_pearson", + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "lora_alpha": 8, + "lora_rank": 8, + "max_source_length": 512, + "model_name": "roberta", + "model_name_or_path": "roberta-base", + "non_linearity": "gelu_new", + "num_train_epochs": 30, + "output_dir": "outputs/lora/roberta-base/v2/wnli", + "per_device_eval_batch_size": 100, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "save_strategy": "epoch", + "save_total_limit": 1, + "split_validation_test": true, + "task_name": "wnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "wnli", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "classifier", + "deltas" + ], + "warmup_ratio": 0.06, + "warmup_steps": 0, + "weight_decay": 0.1, + "overwrite_output_dir": true, + "push_to_hub": false +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/cola.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/cola.json new file mode 100644 index 0000000..093e646 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/cola.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "cola", + "eval_steps": 100, + "evaluation_strategy": 
"steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/cola", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "cola", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "cola", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/mnli.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/mnli.json new file mode 100644 index 0000000..a0dc9ec --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/mnli.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mnli", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/mnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "mnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mnli", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/mrpc.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/mrpc.json new file mode 100644 index 0000000..9c9c060 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/mrpc.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mrpc", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/mrpc", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, 
+ "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "mrpc", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mrpc", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/qnli.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/qnli.json new file mode 100644 index 0000000..021ee0e --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/qnli.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qnli", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/qnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "qnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qnli", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/qqp.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/qqp.json new file mode 100644 index 0000000..be3afde --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/qqp.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qqp", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/qqp", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "qqp", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qqp", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/rte.json 
b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/rte.json new file mode 100644 index 0000000..3a1710f --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/rte.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/sst2.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/sst2.json new file mode 100644 index 0000000..21b6f89 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/sst2.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "sst2", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/sst2", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "sst2", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "sst2", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/stsb.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/stsb.json new file mode 100644 index 0000000..5845f4f --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/stsb.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + 
"eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "stsb", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/stsb", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "stsb", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "stsb", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-boolq.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-boolq.json new file mode 100644 index 0000000..48747fe --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-boolq.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-boolq", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-boolq", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-boolq", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-boolq", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-cb.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-cb.json new file mode 100644 index 0000000..2e8a874 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-cb.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-cb", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": 
"outputs/parallel_adapter/roberta-base/superglue-cb", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-cb", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-cb", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-copa.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-copa.json new file mode 100644 index 0000000..46c7216 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-copa.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-copa", + "eval_steps": 50, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 40, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-copa", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 50, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-copa", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-copa", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-multirc.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-multirc.json new file mode 100644 index 0000000..60ba873 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-multirc.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-multirc", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-multirc", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": 
"superglue-multirc", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-multirc", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-record.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-record.json new file mode 100644 index 0000000..4ce9097 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-record.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-record", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 512, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-record", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-record", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-record", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wic.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wic.json new file mode 100644 index 0000000..c920a7a --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wic.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wic", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-wic", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-wic", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wic", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git 
a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wsc.fixed.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wsc.fixed.json new file mode 100644 index 0000000..563af04 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wsc.fixed.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wsc.fixed", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-wsc.fixed", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-wsc.fixed", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wsc.fixed", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/cola.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/cola.json new file mode 100644 index 0000000..eafe735 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/cola.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "cola", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/roberta-base/cola", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "cola", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "cola", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/mnli.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/mnli.json new file mode 100644 index 0000000..d134b54 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/mnli.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": 
true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mnli", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/prefix/roberta-base/mnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "mnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mnli", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/mrpc.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/mrpc.json new file mode 100644 index 0000000..8ee3263 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/mrpc.json @@ -0,0 +1,47 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mrpc", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/roberta-base/mrpc", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "mrpc", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mrpc", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "modified_modules":[ + "[r][0-5]\\.attention" + ], + "reparameterize": false, + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/qnli.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/qnli.json new file mode 100644 index 0000000..3e5142e --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/qnli.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qnli", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/prefix/roberta-base/qnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + 
"save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "qnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qnli", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/qqp.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/qqp.json new file mode 100644 index 0000000..d36f69a --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/qqp.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qqp", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/prefix/roberta-base/qqp", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "qqp", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qqp", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/rte.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/rte.json new file mode 100644 index 0000000..d6c8470 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/rte.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/roberta-base/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/sst2.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/sst2.json new file mode 100644 index 0000000..a583cce --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/sst2.json @@ -0,0 +1,43 @@ +{ + 
"dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "sst2", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/prefix/roberta-base/sst2", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "sst2", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "sst2", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/stsb.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/stsb.json new file mode 100644 index 0000000..63dd100 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/stsb.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "stsb", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/roberta-base/stsb", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "stsb", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "stsb", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-boolq.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-boolq.json new file mode 100644 index 0000000..013892a --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-boolq.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-boolq", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/roberta-base/superglue-boolq", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + 
"predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-boolq", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-boolq", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-cb.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-cb.json new file mode 100644 index 0000000..4513356 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-cb.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-cb", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/roberta-base/superglue-cb", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-cb", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-cb", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-copa.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-copa.json new file mode 100644 index 0000000..59d4f70 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-copa.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-copa", + "eval_steps": 50, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 40, + "output_dir": "outputs/prefix/roberta-base/superglue-copa", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 50, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-copa", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-copa", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-multirc.json 
b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-multirc.json new file mode 100644 index 0000000..3ea0c77 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-multirc.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-multirc", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/prefix/roberta-base/superglue-multirc", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-multirc", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-multirc", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-record.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-record.json new file mode 100644 index 0000000..1251019 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-record.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-record", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 512, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/prefix/roberta-base/superglue-record", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-record", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-record", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-wic.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-wic.json new file mode 100644 index 0000000..e5d9f12 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-wic.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wic", + "eval_steps": 100, + 
"evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/roberta-base/superglue-wic", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-wic", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wic", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-wsc.fixed.json b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-wsc.fixed.json new file mode 100644 index 0000000..ee224df --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-wsc.fixed.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wsc.fixed", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/roberta-base/superglue-wsc.fixed", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-wsc.fixed", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wsc.fixed", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/metrics/glue.py b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/metrics/glue.py new file mode 100644 index 0000000..ffd0fc1 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/metrics/glue.py @@ -0,0 +1,156 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" GLUE benchmark metric. 
""" + +from scipy.stats import pearsonr, spearmanr +from sklearn.metrics import f1_score, matthews_corrcoef + +import datasets + + +_CITATION = """\ +@inproceedings{wang2019glue, + title={{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding}, + author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.}, + note={In the Proceedings of ICLR.}, + year={2019} +} +""" + +_DESCRIPTION = """\ +GLUE, the General Language Understanding Evaluation benchmark +(https://gluebenchmark.com/) is a collection of resources for training, +evaluating, and analyzing natural language understanding systems. +""" + +_KWARGS_DESCRIPTION = """ +Compute GLUE evaluation metric associated to each GLUE dataset. +Args: + predictions: list of predictions to score. + Each translation should be tokenized into a list of tokens. + references: list of lists of references for each translation. + Each reference should be tokenized into a list of tokens. +Returns: depending on the GLUE subset, one or several of: + "accuracy": Accuracy + "f1": F1 score + "pearson": Pearson Correlation + "spearmanr": Spearman Correlation + "matthews_correlation": Matthew Correlation +Examples: + + >>> glue_metric = datasets.load_metric('glue', 'sst2') # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"] + >>> references = [0, 1] + >>> predictions = [0, 1] + >>> results = glue_metric.compute(predictions=predictions, references=references) + >>> print(results) + {'accuracy': 1.0} + + >>> glue_metric = datasets.load_metric('glue', 'mrpc') # 'mrpc' or 'qqp' + >>> references = [0, 1] + >>> predictions = [0, 1] + >>> results = glue_metric.compute(predictions=predictions, references=references) + >>> print(results) + {'accuracy': 1.0, 'f1': 1.0} + + >>> glue_metric = datasets.load_metric('glue', 'stsb') + >>> references = [0., 1., 2., 3., 4., 5.] + >>> predictions = [0., 1., 2., 3., 4., 5.] 
+ >>> results = glue_metric.compute(predictions=predictions, references=references) + >>> print({"pearson": round(results["pearson"], 2), "spearmanr": round(results["spearmanr"], 2)}) + {'pearson': 1.0, 'spearmanr': 1.0} + + >>> glue_metric = datasets.load_metric('glue', 'cola') + >>> references = [0, 1] + >>> predictions = [0, 1] + >>> results = glue_metric.compute(predictions=predictions, references=references) + >>> print(results) + {'matthews_correlation': 1.0} +""" + + +def simple_accuracy(preds, labels): + return float((preds == labels).mean()) + + +def acc_and_f1(preds, labels): + acc = simple_accuracy(preds, labels) + f1 = float(f1_score(y_true=labels, y_pred=preds)) + return { + "accuracy": acc, + "f1": f1, + } + + +def pearson_and_spearman(preds, labels): + pearson_corr = float(pearsonr(preds, labels)[0]) + spearman_corr = float(spearmanr(preds, labels)[0]) + return { + "pearson": pearson_corr, + "spearmanr": spearman_corr, + } + + +@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class Glue(datasets.Metric): + def _info(self): + if self.config_name not in [ + "sst2", + "mnli", + "mnli_mismatched", + "mnli_matched", + "cola", + "stsb", + "mrpc", + "qqp", + "qnli", + "rte", + "wnli", + "hans", + ]: + raise KeyError( + "You should supply a configuration name selected in " + '["sst2", "mnli", "mnli_mismatched", "mnli_matched", ' + '"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]' + ) + return datasets.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": datasets.Value("int64" if self.config_name != "stsb" else "float32"), + "references": datasets.Value("int64" if self.config_name != "stsb" else "float32"), + } + ), + codebase_urls=[], + reference_urls=[], + format="numpy", + ) + + def _compute(self, predictions, references): + if self.config_name == "cola": + return {"matthews_correlation": matthews_corrcoef(references, predictions)} + elif self.config_name == "stsb": + return pearson_and_spearman(predictions, references) + elif self.config_name in ["mrpc", "qqp"]: + return acc_and_f1(predictions, references) + elif self.config_name in ["sst2", "mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]: + return {"accuracy": simple_accuracy(predictions, references)} + else: + raise KeyError( + "You should supply a configuration name selected in " + '["sst2", "mnli", "mnli_mismatched", "mnli_matched", ' + '"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]' + ) \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/requirements.txt b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/requirements.txt new file mode 100644 index 0000000..8d8ff7a --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/requirements.txt @@ -0,0 +1,8 @@ +accelerate +datasets >= 1.8.0 +sentencepiece != 0.1.92 +scipy +scikit-learn +protobuf +torch >= 1.3 +argunparse \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/run.sh b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/run.sh new file mode 100644 index 0000000..e7363d6 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/run.sh @@ -0,0 +1,7 @@ +files=(cola mnli mrpc qnli qqp rte sst2 stsb superglue-boolq superglue-cb superglue-copa superglue-multirc superglue-record superglue-wic 
superglue-wsc.fixed) +for ((i=$1; i<=$2; i++)) +do + dataset=${files[i]} + echo "id$i:$dataset" + TOKENIZERS_PARALLELISM=false python run_glue.py configs/$3/$dataset.json +done \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/run_glue.py b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/run_glue.py new file mode 100644 index 0000000..9fc403f --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/run_glue.py @@ -0,0 +1,633 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Finetuning the library models for sequence classification on GLUE.""" +# You can also adapt this script on your own text classification task. Pointers for this are left as comments. + +import argparse +import dataclasses +import json +import logging +import os +from pathlib import Path +import random +import re +import sys +from dataclasses import dataclass, field +from typing import Optional + +import datasets +import numpy as np +from datasets import load_dataset, load_metric +# from opendelta.utils.delta_center import create_hub_repo_name + +import transformers +from transformers import ( + AutoConfig, + AutoModelForSequenceClassification, + AutoTokenizer, + DataCollatorWithPadding, + EvalPrediction, + HfArgumentParser, + PretrainedConfig, + # Trainer, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.trainer import Trainer + +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +# check_min_version("4.16.0.dev0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") + +task_to_keys = { + "cola": ("sentence", None), + "mnli": ("premise", "hypothesis"), + "mrpc": ("sentence1", "sentence2"), + "qnli": ("question", "sentence"), + "qqp": ("question1", "question2"), + "rte": ("sentence1", "sentence2"), + "sst2": ("sentence", None), + "stsb": ("sentence1", "sentence2"), + "wnli": ("sentence1", "sentence2"), +} + +logger = logging.getLogger(__name__) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + + Using `HfArgumentParser` we can turn this class + into argparse arguments to be able to specify them on + the command line. 
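+    In these examples the arguments are normally supplied through a single JSON config file
+    (see the configs/ directory); ``RemainArgHfArgumentParser`` below fills the dataclasses
+    from that file and returns any unrecognised keys separately.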
+ """ + + task_name: Optional[str] = field( + default=None, + metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())}, + ) + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + max_seq_length: int = field( + default=128, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} + ) + pad_to_max_length: bool = field( + default=True, + metadata={ + "help": "Whether to pad all samples to `max_seq_length`. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch." + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + max_predict_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " + "value if set." + }, + ) + train_file: Optional[str] = field( + default=None, metadata={"help": "A csv or a json file containing the training data."} + ) + validation_file: Optional[str] = field( + default=None, metadata={"help": "A csv or a json file containing the validation data."} + ) + test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."}) + + def __post_init__(self): + if self.task_name is not None: + self.task_name = self.task_name.lower() + if self.task_name not in task_to_keys.keys(): + raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys())) + elif self.dataset_name is not None: + pass + elif self.train_file is None or self.validation_file is None: + raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.") + else: + train_extension = self.train_file.split(".")[-1] + assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file." + validation_extension = self.validation_file.split(".")[-1] + assert ( + validation_extension == train_extension + ), "`validation_file` should have the same extension (csv or json) as `train_file`." + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
+ """ + + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + +class RemainArgHfArgumentParser(HfArgumentParser): + def parse_json_file(self, json_file: str, return_remaining_args=True ): + """ + Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the + dataclass types. + """ + data = json.loads(Path(json_file).read_text()) + outputs = [] + for dtype in self.dataclass_types: + keys = {f.name for f in dataclasses.fields(dtype) if f.init} + inputs = {k: data.pop(k) for k in list(data.keys()) if k in keys} + obj = dtype(**inputs) + outputs.append(obj) + + remain_args = argparse.ArgumentParser() + remain_args.__dict__.update(data) + if return_remaining_args: + return (*outputs, remain_args) + else: + return (*outputs,) + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. 
+ json_file=os.path.abspath(sys.argv[1]) + model_args, data_args, training_args, delta_args = parser.parse_json_file(json_file, return_remaining_args=True) #args = arg_string, return_remaining_strings=True) #parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args, delta_args = parser.parse_args_into_dataclasses() + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) + # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the + # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named + # label if at least two columns are provided. + # + # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this + # single column. You can easily tweak this behavior (see below) + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.task_name is not None: + # Downloading and loading a dataset from the hub. + + raw_datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir) + # if you encounter error here + # download the dataset, save to disk and then load_from_disk + # from datasets import load_from_disk + # raw_datasets = load_from_disk(f"../../../../huggingface_datasets/saved_to_disk/glue.{data_args.task_name}") + + elif data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + ) + else: + # Loading a dataset from your local files. 
+ # CSV/JSON training and evaluation files are needed. + data_files = {"train": data_args.train_file, "validation": data_args.validation_file} + + # Get the test dataset: you can provide your own CSV/JSON test file (see below) + # when you use `do_predict` without specifying a GLUE benchmark task. + if training_args.do_predict: + if data_args.test_file is not None: + train_extension = data_args.train_file.split(".")[-1] + test_extension = data_args.test_file.split(".")[-1] + assert ( + test_extension == train_extension + ), "`test_file` should have the same extension (csv or json) as `train_file`." + data_files["test"] = data_args.test_file + else: + raise ValueError("Need either a GLUE task or a test file for `do_predict`.") + + for key in data_files.keys(): + logger.info(f"load a local file for {key}: {data_files[key]}") + + if data_args.train_file.endswith(".csv"): + # Loading a dataset from local csv files + raw_datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) + else: + # Loading a dataset from local json files + raw_datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) + # See more about loading any type of standard or custom dataset at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Labels + if data_args.task_name is not None: + is_regression = data_args.task_name == "stsb" + if not is_regression: + label_list = raw_datasets["train"].features["label"].names + num_labels = len(label_list) + else: + num_labels = 1 + else: + # Trying to have good defaults here, don't hesitate to tweak to your needs. + is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"] + if is_regression: + num_labels = 1 + else: + # A useful fast method: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique + label_list = raw_datasets["train"].unique("label") + label_list.sort() # Let's sort it for determinism + num_labels = len(label_list) + + # Load pretrained model and tokenizer + # + # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
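+    # After the backbone is built below, the delta_type from the config (e.g. "prefix" or
+    # "parallel_adapter") is converted into a delta config and attached to the model with OpenDelta,
+    # unless delta_type is "none"; freeze_module then leaves only the modules listed under
+    # "unfrozen_modules" trainable.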
+ config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + num_labels=num_labels, + finetuning_task=data_args.task_name, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + model = AutoModelForSequenceClassification.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + if delta_args.delta_type.lower() != "none": + from opendelta import AutoDeltaConfig + from opendelta.auto_delta import AutoDeltaModel + delta_config = AutoDeltaConfig.from_dict(vars(delta_args)) + delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model) + delta_model.freeze_module(set_state_dict = True) + delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True) + + + + + + + + + + + # Preprocessing the raw_datasets + if data_args.task_name is not None: + sentence1_key, sentence2_key = task_to_keys[data_args.task_name] + else: + # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. + non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"] + if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: + sentence1_key, sentence2_key = "sentence1", "sentence2" + else: + if len(non_label_column_names) >= 2: + sentence1_key, sentence2_key = non_label_column_names[:2] + else: + sentence1_key, sentence2_key = non_label_column_names[0], None + + # Padding strategy + if data_args.pad_to_max_length: + padding = "max_length" + else: + # We will pad later, dynamically at batch creation, to the max sequence length in each batch + padding = False + + # Some models have set the order of the labels to use, so let's make sure we do use it. + label_to_id = None + if ( + model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id + and data_args.task_name is not None + and not is_regression + ): + # Some have all caps in their config, some don't. + label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} + if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): + label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} + else: + logger.warning( + "Your model seems to have been trained with labels, but they don't match the dataset: ", + f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." 
+ "\nIgnoring the model labels as a result.", + ) + elif data_args.task_name is None and not is_regression: + label_to_id = {v: i for i, v in enumerate(label_list)} + + if label_to_id is not None: + model.config.label2id = label_to_id + model.config.id2label = {id: label for label, id in config.label2id.items()} + elif data_args.task_name is not None and not is_regression: + model.config.label2id = {l: i for i, l in enumerate(label_list)} + model.config.id2label = {id: label for label, id in config.label2id.items()} + + if data_args.max_seq_length > tokenizer.model_max_length: + logger.warning( + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." + ) + max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) + + def preprocess_function(examples): + # Tokenize the texts + args = ( + (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) + ) + result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) + + # Map labels to IDs (not necessary for GLUE tasks) + if label_to_id is not None and "label" in examples: + result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] + return result + + with training_args.main_process_first(desc="dataset map pre-processing"): + raw_datasets = raw_datasets.map( + preprocess_function, + batched=True, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = raw_datasets["train"] + if data_args.max_train_samples is not None: + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + + if training_args.do_eval: + if "validation" not in raw_datasets and "validation_matched" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] + if data_args.max_eval_samples is not None: + eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) + + if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None: + if "test" not in raw_datasets and "test_matched" not in raw_datasets: + raise ValueError("--do_predict requires a test dataset") + predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"] + if data_args.max_predict_samples is not None: + predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) + + # Log a few random samples from the training set: + if training_args.do_train: + for index in random.sample(range(len(train_dataset)), 3): + logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + + # Get the metric function + if data_args.task_name is not None: + # metric = load_metric("glue", data_args.task_name) + metric = load_metric("./metrics/glue.py", data_args.task_name) + else: + metric = load_metric("accuracy") + + # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a + # predictions and label_ids field) and has to return a dictionary string to float. 
+ def compute_metrics(p: EvalPrediction): + preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions + preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) + if data_args.task_name is not None: + result = metric.compute(predictions=preds, references=p.label_ids) + if len(result) > 1: + result["combined_score"] = np.mean(list(result.values())).item() + return result + elif is_regression: + return {"mse": ((preds - p.label_ids) ** 2).mean().item()} + else: + return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} + + # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. + if data_args.pad_to_max_length: + data_collator = default_data_collator + elif training_args.fp16: + data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) + else: + data_collator = None + + # Initialize our Trainer + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + compute_metrics=compute_metrics, + tokenizer=tokenizer, + data_collator=data_collator, + ) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.save_model() # Saves the tokenizer too for easy upload + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + results = {} + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + # Loop to handle MNLI double evaluation (matched, mis-matched) + tasks = [data_args.task_name] + eval_datasets = [eval_dataset] + if data_args.task_name == "mnli": + tasks.append("mnli-mm") + eval_datasets.append(raw_datasets["validation_mismatched"]) + + for eval_dataset, task in zip(eval_datasets, tasks): + metrics = trainer.evaluate(eval_dataset=eval_dataset) + + max_eval_samples = ( + data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + ) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + results['eval'] = metrics + + if training_args.do_predict: + logger.info("*** Predict ***") + + # Loop to handle MNLI double evaluation (matched, mis-matched) + tasks = [data_args.task_name] + predict_datasets = [predict_dataset] + if data_args.task_name == "mnli": + tasks.append("mnli-mm") + predict_datasets.append(raw_datasets["test_mismatched"]) + + for predict_dataset, task in zip(predict_datasets, tasks): + # Removing the `label` columns because it contains -1 and Trainer won't like that. 
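+            # (The test splits only carry a placeholder label of -1, so the column holds no usable signal here.)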
+ predict_dataset = predict_dataset.remove_columns("label") + predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions + predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1) + + output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt") + if trainer.is_world_process_zero(): + with open(output_predict_file, "w") as writer: + logger.info(f"***** Predict results {task} *****") + writer.write("index\tprediction\n") + for index, item in enumerate(predictions): + if is_regression: + writer.write(f"{index}\t{item:3.3f}\n") + else: + item = label_list[item] + writer.write(f"{index}\t{item}\n") + + # from IPython import embed; embed() + + # kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"} + # if data_args.task_name is not None: + # kwargs["language"] = "en" + # kwargs["dataset_tags"] = "glue" + # kwargs["dataset_args"] = data_args.task_name + # kwargs["dataset"] = f"GLUE {data_args.task_name.upper()}" + # kwargs["delta_type"] = delta_args.delta_type + + repo_name = create_hub_repo_name(root="DeltaHub", + dataset=data_args.task_name, + delta_type = delta_args.delta_type, + model_name_or_path= model_args.model_name_or_path) + + if training_args.push_to_hub: # TODO add description here + delta_model.save_finetuned(push_to_hub=True, save_directory=repo_name, use_auth_token=True) + # trainer.push_to_hub(**kwargs) + else: + delta_model.save_finetuned(push_to_hub=False, save_directory=repo_name, use_auth_token=True) + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/OpenDelta-0.3.2/examples/legacies/examples_text-classification/util.py b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/util.py new file mode 100644 index 0000000..50393e0 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/examples_text-classification/util.py @@ -0,0 +1,75 @@ +from datasets import load_dataset, load_metric +import torch +import logging + + +logger = logging.getLogger(__name__) + + +class DataLoader: + small_datasets_without_all_splits = ["cola", "wnli", "rte", "superglue-cb", "superglue-copa", "superglue-multirc", + "superglue-wic", "superglue-wsc.fixed", "superglue-rte", "mrpc", "stsb", + "superglue-boolq"] + large_data_without_all_splits = ["qqp", "qnli", "superglue-record", "sst2"] + + def __init__(self, raw_datasets, data_args, model_args, training_args): + self.raw_datasets = raw_datasets + self.data_args = data_args + self.model_args = model_args + self.training_args = training_args + + def shuffled_indices(self, dataset): + num_samples = len(dataset) + generator = torch.Generator() + generator.manual_seed(self.training_args.seed) + return torch.randperm(num_samples, generator=generator).tolist() + + def subsample(self, dataset, indices=None): + """ + Given a dataset returns the subsampled dataset. + :param n_obs: the number of samples of the subsampled dataset. + :param indices: indices to select the samples from, if not given, indices are computed + from by shuffling the given dataset. + :return: subsampled dataset. 
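+        If ``indices`` is not given, they are produced by :meth:`shuffled_indices`, i.e. a permutation seeded
+        with ``training_args.seed``, so the subsample is reproducible across runs.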
+ """ + if indices is None: + indices = self.shuffled_indices(dataset) + return dataset.select(indices) + + def get_split_indices(self, split, dataset, validation_size): + indices = self.shuffled_indices(dataset) + if split == "validation": + return indices[:validation_size] + else: + return indices[validation_size:] + + def get(self, split): + if self.data_args.task_name == 'mnli': + if split == 'validation': + split = 'validation_mismatched' + elif split == 'test': + split = 'validation_matched' + return self.raw_datasets[split] + # For small datasets (n_samples < 10K) without test set, we divide validation set to + # half, use one half as test set and one half as validation set. + if self.data_args.task_name in self.small_datasets_without_all_splits \ + and split != "train": + logger.info("Split validation set into test and validation set.") + dataset = self.raw_datasets['validation'] + indices = self.get_split_indices(split, dataset, validation_size=len(dataset)//2) + dataset = self.subsample(dataset, indices) + # For larger datasets (n_samples > 10K), we divide training set into 1K as + # validation and the rest as training set, keeping the original validation + # set as the test set. + elif self.data_args.task_name in self.large_data_without_all_splits \ + and split != "test": + logger.info("Split training set into train and validation set, use validation set as test set.") + dataset = self.raw_datasets['train'] + indices = self.get_split_indices(split, dataset, validation_size=1000) + dataset = self.subsample(dataset, indices) + elif split == 'train': + dataset = self.raw_datasets[split] + else: + assert split == 'test', print("expected test, but got {}".format(split)) + dataset = self.raw_datasets[split] + return dataset \ No newline at end of file diff --git a/OpenDelta-0.3.2/examples/legacies/setup_seq2seq.py b/OpenDelta-0.3.2/examples/legacies/setup_seq2seq.py new file mode 100644 index 0000000..acecd77 --- /dev/null +++ b/OpenDelta-0.3.2/examples/legacies/setup_seq2seq.py @@ -0,0 +1,41 @@ +"""Install Compacter.""" +import os +import setuptools +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +#os.environ['TORCH_CUDA_ARCH_LIST']="3.5;3.7;6.1;7.0;7.5;8.6+PTX" + +def setup_package(): + long_description = "examples_seq2seq" + setuptools.setup( + name='examples_seq2seq', + version='0.0.1', + description='seq2seq example', + long_description=long_description, + long_description_content_type='text/markdown', + author='Shengding Hu', + license='MIT License', + packages=setuptools.find_packages( + exclude=['docs', 'tests', 'scripts']), + dependency_links=[ + 'https://download.pytorch.org/whl/torch_stable.html', + ], + classifiers=[ + 'Intended Audience :: Developers', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: MIT License', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.7.10', + ], + keywords='text nlp machinelearning', + cmdclass={"build_ext": BuildExtension}, + install_requires=[ + "pyarrow==7.0.0", + "datasets==1.17.0" + ], + ) + + +if __name__ == '__main__': + setup_package() diff --git a/OpenDelta-0.3.2/opendelta/__init__.py b/OpenDelta-0.3.2/opendelta/__init__.py new file mode 100644 index 0000000..c140b36 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/__init__.py @@ -0,0 +1,29 @@ + +__version__ = "0.3.0" + +class GlobalSetting: + def __init__(self): + self.axis_order = [0,1,2] + + +global_setting = GlobalSetting() + +from 
.delta_configs import BaseDeltaConfig +from .utils import logging +from .utils.saving_loading_utils import SaveLoadMixin +from .basemodel import DeltaBase +from .auto_delta import AutoDeltaConfig, AutoDeltaModel +from .utils.structure_mapping import CommonStructureMap +from .delta_models.lora import LoraModel +from .delta_models.bitfit import BitFitModel +from .delta_models.compacter import CompacterModel +from .delta_models.adapter import AdapterModel +from .delta_models.prefix import PrefixModel +from .delta_models.soft_prompt import SoftPromptModel +from .delta_models.low_rank_adapter import LowRankAdapterModel +from .delta_models.parallel_adapter import ParallelAdapterModel + + + +def set_axis_order(axis_order=[0,1,2]): + setattr(global_setting, "axis_order", axis_order) \ No newline at end of file diff --git a/OpenDelta-0.3.2/opendelta/auto_delta.py b/OpenDelta-0.3.2/opendelta/auto_delta.py new file mode 100644 index 0000000..32ff18c --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/auto_delta.py @@ -0,0 +1,392 @@ +from copy import deepcopy +from typing import Any, Dict, OrderedDict +from bigmodelvis import Visualization +import torch.nn as nn +from opendelta.utils.logging import get_logger +import importlib +from opendelta.delta_configs import BaseDeltaConfig +from opendelta.basemodel import DeltaBase +logger = get_logger(__name__) + + +DELTA_CONFIG_MAPPING = { + "lora": "LoraConfig", + "low_rank_adapter": "LowRankAdapterConfig", + "bitfit": "BitFitConfig", + "adapter":"AdapterConfig", + "compacter":"CompacterConfig", + "prefix": "PrefixConfig", + "soft_prompt": "SoftPromptConfig", + "parallel_adapter": "ParallelAdapterConfig", +} + +DELTA_MODEL_MAPPING = { + "lora": "LoraModel", + "low_rank_adapter": "LowRankAdapterModel", + "bitfit": "BitFitModel", + "adapter":"AdapterModel", + "compacter": "CompacterModel", + "prefix": "PrefixModel", + "soft_prompt": "SoftPromptModel", + "parallel_adapter": "ParallelAdapterModel", +} + +class _LazyConfigMapping(OrderedDict): + """ + A dictionary that lazily load its values when they are requested. + """ + + def __init__(self, mapping): + self._mapping = mapping + self._extra_content = {} + self._modules = {} + + def __getitem__(self, key): + if key in self._extra_content: + return self._extra_content[key] + if key not in self._mapping: + raise KeyError(key) + value = self._mapping[key] + module_name = key #model_type_to_module_name(key) + # if module_name not in self._modules: + self._modules[module_name] = importlib.import_module(f".{module_name}", "opendelta.delta_models") + return getattr(self._modules[module_name], value) + + def keys(self): + return list(self._mapping.keys()) + list(self._extra_content.keys()) + + def values(self): + return [self[k] for k in self._mapping.keys()] + list(self._extra_content.values()) + + def items(self): + return [(k, self[k]) for k in self._mapping.keys()] + list(self._extra_content.items()) + + def __iter__(self): + return iter(list(self._mapping.keys()) + list(self._extra_content.keys())) + + def __contains__(self, item): + return item in self._mapping or item in self._extra_content + + def register(self, key, value): + """ + Register a new configuration in this mapping. 
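+
+        For example, ``LAZY_CONFIG_MAPPING.register("my_delta", MyDeltaConfig)`` (where ``MyDeltaConfig`` is a
+        hypothetical config class) makes the new type loadable through :class:`AutoDeltaConfig`.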
+ """ + if key in self._mapping.keys(): + raise ValueError(f"'{key}' is already used by a Transformers config, pick another name.") + self._extra_content[key] = value + + +LAZY_CONFIG_MAPPING = _LazyConfigMapping(DELTA_CONFIG_MAPPING) + + + +class AutoDeltaConfig: + r""" + This is a generic configuration class that will be instantiated as one of the configuration classes of the library + when created with the :meth:`~AutoDeltaConfig.from_finetuned` or :meth:`~AutoDeltaConfig.from_dict` class method. + This class cannot be instantiated directly using ``__init__()`` (throws an error). + """ + + def __init__(self, *args, **kwargs): + raise AttributeError( + f"{self.__class__.__name__} is designed to be instantiated using\n\t(1) `{self.__class__.__name__}.from_finetuned(finetuned_model_name_or_path)`\nor\t(2) `{self.__class__.__name__}.from_dict(config_dict, **kwargs)` " + ) + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any], **kwargs): + r""" Instantiate a DeltaConfig according to the dict. Automatically load the config specified by + :obj:`delta_type`. + + Args: + config_dict (:obj:`dict`): The dict of configs of delta model. + kwargs: Other keyword argument pass to initialize the config. + + Examples: + + .. code-block:: python + + config = AutoDeltaConfig.from_dict({"delta_type":"lora"}) # This will load the dault lora config. + config = AutoDeltaConfig.from_dict({"delta_type":"lora", "lora_r":5}) # Will load the default lora config, with lora_r = 5 + + """ + config_dict = deepcopy(config_dict) + delta_type = config_dict.pop("delta_type", None) + if delta_type is None: + raise RuntimeError("Do not specify a delta type, cannot load the default config") + config_class = LAZY_CONFIG_MAPPING[delta_type] + return config_class.from_dict(config_dict, **kwargs) + + + @classmethod + def from_finetuned(cls, finetuned_delta_path, **kwargs): + r""" + Instantiate one of the configuration classes of the library from a finetuned delta model configuration. + The configuration class to instantiate is selected based on the ``delta_type`` property of the config object that + is loaded. + + Parameters: + + finetuned_delta_path (:obj:`str` or :obj:`os.PathLike`, *optional*): Can be either: + + - A string, the model id of a finetuned delta model configuration hosted inside a model repo on huggingface.co. Valid model ids can be located at the root-level, like ``Davin/lora``, or namespaced under a user or organization name, like ``DeltaHub/lora_t5-base_mrpc``. + - A path to a *directory* containing a configuration file saved using the :py:meth:`~opendelta.basemodel.DeltaBase.save_finetuned` method, e.g., ``./my_model_directory/``. + - A path or url to a saved configuration JSON *file*, e.g.,``./my_model_directory/configuration.json``. + + cache_dir (:obj:`str` or :obj:`os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + + Examples: + + .. code-block:: python + + from transformers import AutoConfig + delta_config = AutoDeltaConfig.from_finetuned("thunlp/FactQA_T5-large_Adapter") + + """ + + + config_dict, kwargs = BaseDeltaConfig.get_config_dict(finetuned_delta_path, **kwargs) + if "delta_type" in config_dict: + config_class = LAZY_CONFIG_MAPPING[config_dict["delta_type"]] + return config_class.from_dict(config_dict, **kwargs) + else: + # Fallback: use pattern matching on the string. 
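+            # e.g. a checkpoint name such as ``DeltaHub/lora_t5-base_mrpc`` matches the ``lora`` pattern below.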
+ for pattern, config_class in LAZY_CONFIG_MAPPING.items(): + if pattern in str(finetuned_delta_path): + return config_class.from_dict(config_dict, **kwargs) + + raise ValueError( + f"Unrecognized model in {finetuned_delta_path}. " + f"Should have a `delta_type` key in the loaded config, or contain one of the following strings " + f"in its name: {', '.join(LAZY_CONFIG_MAPPING.keys())}" + ) + +### AutoModels below + +class _LazyAutoMapping(OrderedDict): + """ + " A mapping config to object (model or tokenizer for instance) that will load keys and values when it is accessed. + + Args: + + - config_mapping: The map model type to config class + - model_mapping: The map model type to model (or tokenizer) class + """ + + def __init__(self, config_mapping, model_mapping): + self._config_mapping = config_mapping + self._reverse_config_mapping = {v: k for k, v in config_mapping.items()} + self._model_mapping = model_mapping + self._extra_content = {} + self._modules = {} + + def __getitem__(self, key): + if key in self._extra_content: + return self._extra_content[key] + model_type = self._reverse_config_mapping[key.__name__] + if model_type not in self._model_mapping: + raise KeyError(key) + model_name = self._model_mapping[model_type] + return self._load_attr_from_module(model_type, model_name) + + def _load_attr_from_module(self, model_type, attr): + if model_type not in self._modules: + self._modules[model_type] = importlib.import_module(f".{model_type}", "opendelta.delta_models") + return getattribute_from_module(self._modules[model_type], attr) + + def keys(self): + mapping_keys = [ + self._load_attr_from_module(key, name) + for key, name in self._config_mapping.items() + if key in self._model_mapping.keys() + ] + return mapping_keys + list(self._extra_content.keys()) + + def get(self, key, default): + try: + return self.__getitem__(key) + except KeyError: + return default + + def __bool__(self): + return bool(self.keys()) + + def values(self): + mapping_values = [ + self._load_attr_from_module(key, name) + for key, name in self._model_mapping.items() + if key in self._config_mapping.keys() + ] + return mapping_values + list(self._extra_content.values()) + + def items(self): + mapping_items = [ + ( + self._load_attr_from_module(key, self._config_mapping[key]), + self._load_attr_from_module(key, self._model_mapping[key]), + ) + for key in self._model_mapping.keys() + if key in self._config_mapping.keys() + ] + return mapping_items + list(self._extra_content.items()) + + def __iter__(self): + return iter(self.keys()) + + def __contains__(self, item): + if item in self._extra_content: + return True + if not hasattr(item, "__name__") or item.__name__ not in self._reverse_config_mapping: + return False + model_type = self._reverse_config_mapping[item.__name__] + return model_type in self._model_mapping + + def register(self, key, value): + """ + Register a new model in this mapping. 
+ """ + if hasattr(key, "__name__") and key.__name__ in self._reverse_config_mapping: + model_type = self._reverse_config_mapping[key.__name__] + if model_type in self._model_mapping.keys(): + raise ValueError(f"'{key}' is already used by a Transformers model.") + + self._extra_content[key] = value + + + +LAZY_DELTA_MAPPING = _LazyAutoMapping(DELTA_CONFIG_MAPPING, DELTA_MODEL_MAPPING) + + + +def get_values(model_mapping): + result = [] + for model in model_mapping.values(): + if isinstance(model, (list, tuple)): + result += list(model) + else: + result.append(model) + + return result + + +def getattribute_from_module(module, attr): + if attr is None: + return None + if isinstance(attr, tuple): + return tuple(getattribute_from_module(module, a) for a in attr) + if hasattr(module, attr): + return getattr(module, attr) + # Some of the mappings have entries model_type -> object of another model type. In that case we try to grab the + # object at the top level. + transformers_module = importlib.import_module("transformers") + return getattribute_from_module(transformers_module, attr) + + + +class AutoDeltaModel: + r""" + """ + _delta_model_mapping = LAZY_DELTA_MAPPING + def __init__(self, *args, **kwargs): + # raise EnvironmentError( + # f"{self.__class__.__name__} is designed to be instantiated " + # f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or " + # f"`{self.__class__.__name__}.from_config(config)` methods." + # ) + + raise AttributeError( + f"{self.__class__.__name__} is designed to be instantiated using\n\t(1) `{self.__class__.__name__}.from_finetuned(finetuned_delta_path, backbone_model, *model_args, **kwargs)`\nor\t(2) `{self.__class__.__name__}.from_config(delta_config, backbone_model, **kwargs)`" + ) + + @classmethod + def from_config(cls, config, backbone_model, **kwargs) -> DeltaBase: + r"""Automatically instantiates a delta model based on the :obj:`config`. The delta model correspond to the delta + :obj:`config` will be loaded and initialized using the arguments in :obj:`config`. + + .. note:: + Only using :meth:`from_config` method will not load the finetuned weight file (e.g., pytorch_model.bin). + Please use from_finetuned directly. + + Args: + config (:obj:`BaseDeltaConfig`): + backbone_model (:obj:`nn.Module`): + + Examples: + + .. code-block:: python + + config = AutoDeltaConfig.from_finetuned("DeltaHub/lora_t5-base_mrpc") + delta_model = AutoDeltaModel.from_config(config, backbone_model) + + """ + if type(config) in cls._delta_model_mapping.keys(): + model_class = cls._delta_model_mapping[type(config)] + return model_class.from_config(config, backbone_model, **kwargs) + + raise ValueError( + f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" + f"Model type should be one of {', '.join(c.__name__ for c in cls._delta_model_mapping.keys())}." + ) + + @classmethod + def from_finetuned(cls, finetuned_delta_path, backbone_model, *model_args, **kwargs) -> DeltaBase: + r""" Automatically instantiated a delta model and load the finetuned checkpoints based on the + :obj:`finetuned_delta_path`, which can either be a string pointing to a local path or a url pointint to + the delta hub. It will check the hash after loading the delta model to see whether the correct backbone and + delta checkpoint are used. 
+
+        Args:
+            finetuned_delta_path (:obj:`str` or :obj:`os.PathLike`, *optional*): Can be either:
+
+                - A string, the model name of a finetuned delta model configuration hosted inside a model repo on Delta Center, like ``thunlp/FactQA_T5-large_Adapter``.
+                - A path to a directory containing a configuration file saved using the :meth:`~opendelta.utils.saving_loading_utils.SaveLoadMixin.save_finetuned` method, e.g., ``./my_model_directory/``.
+                - A path or url to a saved configuration JSON *file*, e.g., ``./my_model_directory/configuration.json``. The last two options are not tested but are inherited from Hugging Face.
+
+            backbone_model (:obj:`nn.Module`): The backbone model to be modified.
+            model_args: Other arguments used to initialize the model. See :meth:`DeltaBase.from_finetuned` for details.
+            kwargs: Other keyword arguments that will be passed into :meth:`DeltaBase.from_finetuned`. See :meth:`DeltaBase.from_finetuned` for details.
+
+        Example:
+
+        .. code-block:: python
+
+            delta_model = AutoDeltaModel.from_finetuned("thunlp/FactQA_T5-large_Adapter", backbone_model=t5)  # ``t5`` is an already-loaded backbone model
+
+        """
+        delta_config = kwargs.pop("delta_config", None)
+
+        if not isinstance(delta_config, BaseDeltaConfig):
+            delta_config, kwargs = AutoDeltaConfig.from_finetuned(
+                finetuned_delta_path, return_unused_kwargs=True, **kwargs
+            )
+        if type(delta_config) in cls._delta_model_mapping.keys():
+            model_class = cls._delta_model_mapping[type(delta_config)]
+            return model_class.from_finetuned(finetuned_delta_path, backbone_model, *model_args, delta_config=delta_config, **kwargs)
+        raise ValueError(
+            f"Unrecognized configuration class {delta_config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
+            f"Model type should be one of {', '.join(c.__name__ for c in cls._delta_model_mapping.keys())}."
+        )
+
+
+
+
+
+if __name__ == "__main__":
+
+    config = AutoDeltaConfig.from_dict({"delta_type":"lora", "lora_r": 7})
+
+
+    from transformers import AutoModelForSequenceClassification
+    model = AutoModelForSequenceClassification.from_pretrained("../../plm_cache/roberta-base/", num_labels=2)
+    # from IPython import embed
+    delta_model = AutoDeltaModel.from_config(config, model)
+    delta_model.freeze_module(exclude = ['deltas','classifier'], set_state_dict = True)
+
+
+    # delta_model.save_finetuned("autodelta_try", push_to_hub=True, private=True)
+    delta_model = AutoDeltaModel.from_finetuned("ShengdingHu/autodelta_try", model, use_auth_token=True)
+
+
+
diff --git a/OpenDelta-0.3.2/opendelta/basemodel.py b/OpenDelta-0.3.2/opendelta/basemodel.py
new file mode 100644
index 0000000..d5e335e
--- /dev/null
+++ b/OpenDelta-0.3.2/opendelta/basemodel.py
@@ -0,0 +1,784 @@
+
+
+from collections import OrderedDict
+import os
+from opendelta.delta_configs import BaseDeltaConfig
+from opendelta.utils.inspect import inspect_module_statistics
+from opendelta.utils.model_md5 import gen_model_hash
+from opendelta.utils.signature import get_arg_names, signature
+from typing import Optional, Union
+from opendelta.utils.cuda import get_device
+from opendelta.utils.name_based_addressing import *
+import torch.nn as nn
+import torch
+from functools import wraps
+# from decorator import decorate
+from opendelta.utils.decorate import decorate
+from opendelta.utils.structure_mapping import transform
+from transformers.file_utils import PushToHubMixin
+from transformers.deepspeed import deepspeed_config, is_deepspeed_zero3_enabled
+from opendelta import SaveLoadMixin
+from opendelta import logging
+from 
opendelta.utils.structure_mapping import CommonStructureMap +from opendelta.utils.interactive.web import interactive +from opendelta.utils.data_parallel import new_replicate_for_data_parallel +from opendelta.utils.cuda import move_dict_to_cuda +import sys + +from opendelta.utils.data_parallel import caller_map +from opendelta.utils.backend import BackendMapping +logger = logging.get_logger(__name__) + +def is_leaf_module(module): + r"""Whether the module is a leaf module + """ + return len([n for n,_ in module.named_children()]) == 0 + + + +def non_module_param(module: nn.Module): + module_names = [n for n, _ in module.named_modules()] + ret = [] + for n, p in module.named_parameters(): + if not is_child_key(n, module_names): + ret.append((n,p)) + return ret + + + + + +class DeltaBase(nn.Module, SaveLoadMixin): + r"""This is the base class for all delta models. It provides four simple but effective functionalities + for building the delta model: + + #. addressing a module inside the backbone model using a minimal description key. + #. provide the interface for modifying and inserting model which keeps the docs/IO the same as the module + before modification. + #. pass a pseudo input to determine the inter dimension of the delta models. + #. freeze a part of model parameters according to key. + + It also provides unified interface for model loading and saving. + + Class attributes (overridden by derived classes): + + - delta_type (:obj:`str`): the name of the delta modules, used to create the correct :class:`opendelta.AutoDeltaModel`. + - config_class (:class:`BaseDeltaConfig`): The corresponding config model + + + Args: + backbone_model (:obj:`nn.Module`, *required*): backbone model that the delta models are build opon. The modification to the + backbone model are in place. + modified_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules are subjected to update. + + .. note:: + leave this argument :obj:`None` will make the delta model return to the default setting, which add the delta + models to the position experimented the paper. In this setting, the common structure mapping is loaded to + addressing the corresponding modules. + exclude_modules (:obj:`str`, *optional*, default to :obj:`None`): The modules starts with these strings will be excluded in modification. + Note that currently only plain text (no regular expression) is supported. + unfrozen_modules (:obj:`str`, *optional*, default to :obj:`None`): The modules that are **not** frozen when freezing the main part of the model. + registraction_name (:obj:`str`, *optional*, default to ``"deltas"``): The root name of the delta models when + attached to the backbone model. + common_structure (:obj:`bool`, *optional*, default to :obj:`None`): Whether use the common structure mapping to specify the + modified_modules. i.e., if common_structure=True, then we use a common ["attn"] for attention module in different models. + We DO NOT recommend manually set ``common_structure`` to ``true`` by yourself unless you are using delta + among multiple backbones and don't want to modify the code. + + interactive_modify (:obj:`bool` or :obj:`int`, *optional*, default to :obj:`None`): Whether to use interactive modification. + By setting to :obj:`int` can specify the port of web server. 
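+
+    Examples:
+
+        A minimal usage sketch (the concrete delta class and backbone below are illustrative choices, not the
+        only supported ones):
+
+        .. code-block:: python
+
+            from transformers import AutoModelForSequenceClassification
+            from opendelta import LoraModel
+
+            backbone = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
+            delta_model = LoraModel(backbone_model=backbone)  # modifies the backbone in place
+            delta_model.freeze_module(exclude=["deltas", "classifier"], set_state_dict=True)
+            delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True)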
+ """ + delta_type = "" + default_modified_modules = [] + default_exclude_modules = ["lm_head"] + config_class = BaseDeltaConfig + default_unfrozen_modules = ["deltas"] + _need_pseudo_data = True + _supported_backends = ['hf'] + def __init__(self, + backbone_model: nn.Module, + modified_modules: Optional[List[str]] = None, + exclude_modules: Optional[List[str]] = None, + unfrozen_modules: Optional[List[str]] = None, + interactive_modify: Optional[Union[bool, int]] = False, + common_structure: Optional[bool] = False, + backend: Optional[str]= "hf", # select from ["hf", "bmt"] + ): + nn.Module.__init__(self) + # register the backbone model after init using self.__dict__ method to avoid adding backbone_model + # to the modules of the delta model. + self.__dict__["backbone_model"] = backbone_model + if modified_modules is None and exclude_modules is None: + if interactive_modify: + if isinstance(interactive_modify, bool) and interactive_modify==True: + self.modified_modules = interactive(backbone_model) + else: + self.modified_modules = interactive(backbone_model, port=interactive_modify) + self.common_structure = False + self.exclude_modules = self.default_exclude_modules + else: + self.modified_modules = self.default_modified_modules + self.common_structure = True + self.exclude_modules = self.default_exclude_modules + else: + if interactive_modify: + raise ValueError("Use modified_modules(or exclude modules) and interactive_modify at the same time is not supported") + if modified_modules is not None: + self.modified_modules = modified_modules + else: + self.modified_modules = self.default_modified_modules + if exclude_modules is not None: + self.exclude_modules = exclude_modules + else: + self.exclude_modules = self.default_exclude_modules + self.common_structure = common_structure + if self.common_structure: + self.structure_mapping = CommonStructureMap(self.backbone_model) + else: + self.structure_mapping = None + if unfrozen_modules is None: + self.unfrozen_modules = self.default_unfrozen_modules + else: + self.unfrozen_modules = unfrozen_modules + if self.common_structure and self.structure_mapping is None: + raise RuntimeError("Using common structure but the structure mapping is None") + if backend not in self._supported_backends: + raise RuntimeError("Currently, backend `{}` is not supported for `{}`".format(backend, self.__class__.__name__)) + self.backend = backend + self.backend_mapping = BackendMapping(backend) + + def forward(self, *args, **kwargs) -> RuntimeError: + r""" + .. warning:: + + Removed method. As the model is a delta model, which should be attached to a backbone model \ + and can't forward any data by itself. Please using the backbone model's forward function \ + after attach the delta model to the backbone. + """ + raise RuntimeError("This is a delta model, which should be attached to a backbone model \ + and can't forward any data by itself. Please using the backbone model's forward function \ + after attach the delta model to the backbone. ") + + @classmethod + def from_config(cls, config: Union[BaseDeltaConfig, dict], backbone_model: nn.Module, check_hash=True, **kwargs): + r"""Initialize a delta model from a config object or a dict containing the configs. To temperarily change + a value in the config, pass it through kwargs. 
If the config has a backbone model's hash, which means it is + a finetuned delta model's config, then we will compare the hash in the config and the newly caculated to ensure + the finedtuned delta model is trained on the passed backbone_model. Pass ``check_hash=False`` to disable the + checking. + + Args: + config (:obj:`BaseDeltaConfig` or :obj:`dict`) A config object or a dict that contains the necessary value to + initialize the delta model. + backbone_model (:obj:`nn.Module`) A pytorch module that will be pass into the delta model as the backbone + model. modifications will be made in place in the backbone model. + check_hash (:obj:`bool`, default to ``True``) Whether to check hash of the backbone model and the config's + backbone hash. + kwargs: Any configurations that are passed to update the config object. #TODO unit test needed. + """ + supported_keys = get_arg_names(cls.__init__) + get_arg_names(DeltaBase.__init__) + config_dict = config.to_dict() + for key in list(config_dict.keys()): + if key not in supported_keys: + config_dict.pop(key) + return cls(backbone_model, **config_dict) + + + def add_all_delta_to_backbone(self, + backbone: nn.Module, + modified_modules: List[str], + ) -> nn.Module: + r"""The main function to add delta models to the backbone model based on the :obj:`modified_modules`. + + + Args: + backbone_model (:obj:`nn.Module`, *required*) backbone model that the delta models are build opon. The + modification to the backbone model are in place. + modified_modules (:obj:`List[str]`, *optional*, default to :obj:`None`) The modules are subjected to update. + leave this argument :obj:`None` will make the delta model return to the default setting, which add the delta + models to the position experimented the paper. In this setting, the common structure mapping is loaded to + addressing the corresponding modules. + + Returns: + :obj:`nn.Module` The modified backbone model. + + """ + self.plm_total_params = sum(p.numel() for p in backbone.parameters()) + # create a new key list to avoid recursion. + backbone_key_list = [key for key, _ in backbone.named_modules()] + for key in backbone_key_list: + if self.find_key(key, modified_modules): + self.update_module(backbone, key) + if self._need_pseudo_data: + self._pseudo_data_to_instantiate(backbone) + + # mark the paratmers that are the delta parameters for easily displaying the delta_paramters. + self.mark_as_delta() + return backbone + + def _pseudo_data_to_instantiate(self, backbone: Optional[nn.Module]=None): + if self.structure_mapping is None: + self._pseudo_data_to_instantiate_module(backbone) + else: + for key in self.structure_mapping.matched_pairs: + if key == "": + submodule = backbone + else: + _, _, submodule = self.find_module(backbone, key) + self._pseudo_data_to_instantiate_module(submodule) + + def mark_as_delta(self, module: nn.Module=None,): + r"""[NODOC] Mark :obj:`module`'s all parameters as delta parameters by setting a ``_is_delta`` attribute to each of them. + Generally, it is used after creating the delta modules. By leaving module to :obj:`None`, it will mark all the parameters in the + delta model as ``_is_delta``. + + Args: + module (:obj:`nn.Module`): The module to mark as delta. + """ + if module is None: + module=self # all the parameters in the delta model. + for p in module.parameters(): + setattr(p, "_is_delta", True) + + def update_module(self, module: nn.Module, key: str): + r"""Update a module specified by :obj:`key`. The method is reimplemented in each specific delta model. 
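+
+        A subclass typically locates the target submodule with :meth:`find_module` and then wraps or replaces it,
+        for example (a sketch; ``delta_module`` stands for whatever new module the subclass constructs):
+
+        .. code-block:: python
+
+            def update_module(self, module: nn.Module, key: str):
+                parent_ref, child_name, child_ref = self.find_module(module, key)
+                self.insert_sequential_module(child_ref, delta_module=delta_module, delta_name="delta")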
+ """ + raise NotImplementedError + + + def freeze_module(self, + module: Optional[nn.Module] = None, + exclude: Optional[List[str]] = None, + set_state_dict: Optional[bool]=True, + ): + r"""Freeze the parameters of plm. Leave the parameters in exclude untouched. + deltas module is filtered with ``_is_delta`` attributes because it may have parameter sharing to the main + model, (e.g., bias term) + + Args: + module (:obj:`nn.Module`, *optional*, default to :obj:`None`): The module of which some parts are frozen. + If left with :obj:`None`, the function will the self.backbone_model as the module to be frozen. + exclude (:obj:`List[str]`, *optional*, default to ``["deltas"]``): The parameters that don't need to + be freezed. Default to all the delta parameters. + set_state_dict (:obj:`bool`, *optional*, default to :obj:`True`): Whether setting the backbone model's state + dict to all the parameters that still need grad. + prefix (:obj:`str`, *optional*, default to ``""``): A parameters that are used for recursive frozen. + Should not be changed by passing argument other than ``""``. + + """ + if exclude is None: + exclude = self.unfrozen_modules + + if module is None: + module = self.backbone_model + self._freeze_module_recursive(module, exclude, "") # modify the active state dict that still need grad + if set_state_dict: + self.set_active_state_dict(module) + + def _freeze_module_recursive(self, + module: Optional[nn.Module] = None, + exclude: Optional[List[str]] = None, + prefix=""): + r"""[NODOC] Freeze the parameters of plm. Leave the parameters in exclude untouched. + deltas module is filtered with ``_is_delta`` attributes because it may have parameter sharing to the main + model, (e.g., bias term) + + Args: + module (:obj:`nn.Module`, *optional*, default to :obj:`None`): The module of which some parts are frozen. + If left with :obj:`None`, the function will the self.backbone_model as the module to be frozen. + exclude (:obj:`List[str]`, *optional*, default to ``["deltas"]``): The parameters that don't need to + be freezed. Default to all the delta parameters. + set_state_dict (:obj:`bool`, *optional*, default to :obj:`True`): Whether setting the backbone model's state + dict to all the parameters that still need grad. + prefix (:obj:`str`, *optional*, default to ``""``): A parameters that are used for recursive frozen. + Should not be changed by passing argument other than ``""``. + + """ + + if is_leaf_module(module): + for n, p in module.named_parameters(): + next_prefix = n if prefix == "" else ".".join([prefix,n]) + if self.find_key(next_prefix, exclude): + continue + if "deltas" not in exclude or (not (hasattr(p, "_is_delta") and getattr(p, "_is_delta"))): + p.requires_grad = False + return + else: + for n, c in module.named_children(): + next_prefix = n if prefix == "" else ".".join([prefix,n]) + if self.find_key(next_prefix, exclude): # if found, untouch the parameters + continue + else: # firstly freeze the non module params, then go deeper. + params = non_module_param(module) + for n, p in params: + if "deltas" not in exclude or (not (hasattr(p, "_is_delta") and getattr(p, "_is_delta"))): + p.requires_grad = False + self._freeze_module_recursive(c, exclude=exclude, prefix=next_prefix) + + + + + + def find_key(self, key: str, target_list: List[str]): + r"""Check whether any target string is in the key or in the tail of the key, i.e., + + Args: + key (:obj:`str`): The key (name) of a submodule in a ancestor module. 
+ E.g., model.encoder.layer.0.attention + target_list (List[Union[:obj:`str`, :obj:`re.Pattern`]]): The target list that we try to match ``key`` with. E.g., ["attention"] + + Returns: + :obj:`bool` True if the key matchs the target list. + """ + for x in self.exclude_modules: + if key.startswith(x): # start with the excluded key + return False + virtual_key, in_virtual_order = None, None + if self.structure_mapping is not None: + key, virtual_key, in_virtual_order = self.structure_mapping.transform(key, strict=False) + # currently in_virtual_order not in use, it means that if the common structure designate adding adapter to FFN, it will be add to all submodule of FFN. + if not key: + return False + if virtual_key is None: + return endswith_in(key, target_list) + else: + return endswith_in(key, target_list) or endswith_in(virtual_key, target_list) + + + def _pseudo_data_to_instantiate_module(self, module: Optional[nn.Module]=None): + r"""Some delta model requires a pseudo-data be passed through the model to understand the dimensionality of each tensor in the computation graph. + + (1) The model in the Huggingface Transformers library usually has the so-called `dummy_inputs`. We will make use of it. + (2) If the model does not have `dummy_inputs`, we will try to create it and throw a warning. + (3) If we encounter an error in (2), we will suggest you to create it by passing the dummy_inputs variable. + + Args: + module (:obj:`nn.Module`, *optional*, default to :obj:`None`): The backbone model. + + """ + if module is None: + module = self.backbone_model + device = get_device(module) + _auto_dummy = False + try: + dummy_inputs = module.dummy_inputs + dummy_inputs = move_dict_to_cuda(dummy_inputs, device) + except AttributeError: + logger.warning(f"No `dummy_inputs` attribute in {module.__class__.__name__} , automatically create `dummy_inputs`. Very likely to encounter error. To set dummy_inputs for your model, please use: `setattr(backbone_model, 'dummy_inputs', some_dummy_inputs)` before initializing `{self.__class__.__name__}`") + _auto_dummy = True + pass + if _auto_dummy: + _most_simple_input = torch.tensor([[0,0]]).to(device) + if "decoder_input_ids" in signature(module.forward).args: + dummy_inputs = {"input_ids": _most_simple_input, "decoder_input_ids": _most_simple_input} + else: + dummy_inputs = {"input_ids": _most_simple_input} + + _auto_dummy_fail = False + try: + module(**dummy_inputs) + except Exception as e: + _auto_dummy_fail = True + + if _auto_dummy_fail and _auto_dummy: + raise AttributeError(f"str({e})\n\tThe {self.__class__.__name__} requires a dummy_inputs to be passed through the model to understand the dimensionality of each tensor in the computation graph. \n\t The {module.__class__.__name__} Class has no dummy_inputs, and automatically created dummy_inputs failed.\n\t Refer to `https://opendelta.readthedocs.io/en/latest/notes/faq.html` for detail.") + + + + + + def trainable_parameters_names(self, module: Optional[nn.Module]=None): + r"""[NODOC] A small sugar function to return all the trainable parameter's name in the (by default, backbone) model. + + Args: + module (:obj:`nn.Module`): of which module we want to know the trainable paramemters' name. 
+ + Returns: + :obj:`List[str]` + """ + if module is None: + module = self.backbone_model + return [n for n,p in module.named_parameters() if p.requires_grad] + + def frozen_parameters_names(self, module: Optional[nn.Module]=None): + r"""[NODOC] A small sugar function to return all the frozen parameters' name in the (by default, backbone) model. + + Args: + module (:obj:`nn.Module`): of which module we want to know the frozen paramemters' name. + + Returns: + :obj:`List[str]` + """ + if module is None: + module = self.backbone_model + return [n for n,p in module.named_parameters() if not p.requires_grad] + + def trainable_parameters(self,module: Optional[nn.Module]=None): + r"""[NODOC] A small sugar function to return all the frozen parameters in the (by default, backbone) model. + + Args: + module (:obj:`nn.Module`): of which module we want to know the frozen paramemters. + + Returns: + :obj:`List[nn.Parameter]` + """ + if module is None: + module = self + return [p for n,p in module.named_parameters() if p.requires_grad] + + + def num_trainable_parameters(self, module: Optional[nn.Module]=None): + r"""[NODOC] A small sugar function to get the number of trainable parameter in the backbone model. Often used to + compute the trainable rate. + + Args: + module (:obj:`nn.Module`): of which module we want to know the number of trainable paramemters. + + Returns: + :obj:`List[nn.Parameter]` + """ + if module is None: + module = self + pnum_tot = 0 + for param in module.parameters(): + if param.requires_grad: + pnum_tot += param.numel() + return pnum_tot + + def num_total_parameters(self, module: Optional[nn.Module]=None): + r"""[NODOC] A small sugar function to get the number of trainable parameter in the backbone model. Often used to + compute the trainable rate. + + Args: + module (:obj:`nn.Module`): of which module we want to know the number of trainable paramemters. + + Returns: + :obj:`List[nn.Parameter]` + """ + if module is None: + module = self + pnum_tot = 0 + for param in module.parameters(): + pnum_tot += param.numel() + return pnum_tot + + + + def find_module(self, root_module: nn.Module, key:str): + r"""Find the module using a key and the root module. Return both the parent reference, the child name and reference. + + Args: + root_module (:obj:`root_module`): The root_module to find the sub module in + key (:obj:`str`): The relative key to the root module. + + Returns: + (:obj:`nn.Module`, :obj:`str`, :obj:`nn.Module`): + * A reference to the parent module of the target module, mainly for substuting the target module. + * The key of the target module relevant to its parent module + * Target module. + """ + sub_keys = key.split(".") + parent_module = root_module + for sub_key in sub_keys[:-1]: + parent_module = getattr(parent_module, sub_key) + module = getattr(parent_module, sub_keys[-1]) + return parent_module, sub_keys[-1], module + + def _register_delta_infos(self, parent_module, _delta_info): + r"""Register the delta infomation. 
+ Automatically incrementing the suffix for repeated delta_names + """ + _delta_infos = getattr(parent_module, "_delta_infos", []) + if len(_delta_infos) > 0: # check if duplicated name + list_of_deltas = [d['delta_name'] for d in _delta_infos] + cur_name = _delta_info['delta_name'] + if cur_name in list_of_deltas: + cur_name = cur_name + "_1" + counter = 1 + while cur_name in list_of_deltas: + counter += 1 + cur_name = cur_name.split("_")[0] + "_"+str(counter) + _delta_info["delta_name"] = cur_name + _delta_infos.append(_delta_info) + setattr(parent_module, "_delta_infos", _delta_infos) + + def replace_module(self, + parent_module: nn.Module, + child_name: str, + child_module: nn.Module, + new_module: nn.Module, + delta_name: Optional[str] = "delta", + ): + r"""Replace a module's child module with the new_module(a delta module). Used by delta method based on direct + replacement, such as :class:`opendelta.delta_modules.lora.LoraModel`. + + Args: + parent_module (:obj:`nn.Module`): The parent module of the replacement. + child_name (:obj:`str`): The chird module's name, i.e., parent_module.child_name give us child_module + child_module (:obj:`nn.Module`): The original child module. + new_module (:obj:`nn.Module`): The delta module. + delta_name (:obj:`str`, *optional*, default ot ``delta``): The name of the delta module, used for recording. + parent_module.delta_name WILL NOT give you the delta module. + """ + self.delta_modules.append(new_module) + setattr(parent_module, child_name, new_module) + # register delta info + _delta_info = {"method": "replace", + "delta_module": new_module, + "child_name": child_name, + "org_module": child_module, + "delta_name": delta_name, + "delta_belong": self, + "state": "on"} + self._register_delta_infos(parent_module=parent_module, + _delta_info = _delta_info, + ) + + + def modify_module(self, module: nn.Module): + r"""Modify the inside parameteres of a module. This method will be reimplemented in different + derived class if needed. + """ + raise NotImplementedError + + def insert_module(self, module, method='sequential', delta_module=None, delta_name='delta', strict=False, _delta_info=None): + r"""insert a module (previous not exists in the code base) before/after a module. Specifically, it modifies the forward + function of the original module to firstly pass the arguments into the new module's forward function and then pass + it into the original ones. The new module can also be inserted after the original module with similar mechanism. + + When implementing the new module , researchers should be aware of the components of arguments of the original module's forward function. + + Args: + module: (:obj:`nn.Module`): The (sub)module to inserted a delta module. + delta_module: (:obj:`DeltaBase`): The delta module to be inserted. + name: (:obj:`str`, *optional*): The name of the delta in the backbone module. + strict: (:obj:`bool`, *optional*): Whether to prohibit modify a modified module. + _delta_info (:obj:`Dict`, *optional*): Used in attach(), reattach a delta module to backbone. The info of + original delta is passed through ``_delta_info``. 
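+
+        .. note::
+
+            The convenience wrappers :meth:`insert_sequential_module` and :meth:`insert_parallel_module` below
+            call this method with ``method='sequential'`` and ``method='parallel'`` respectively.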
+ + """ + + + if strict: + if hasattr(module.forward, "__wrapped__"): + raise RuntimeWarning("The forward function might have been wrapped by a decorator, is it intended?") + + # record info for plug and unplug and nested wrap + if _delta_info is None: + if delta_module is None: + raise RuntimeError("delta module can't be none to ensure successful replicate of the parent module.") + + _delta_info = {"method": method, + "delta_module": delta_module, + "delta_name": delta_name, + "delta_belong": self, + "state": "on"} + self._register_delta_infos(parent_module=module, + _delta_info = _delta_info) + else: + delta_module = _delta_info["delta_module"] + delta_name = _delta_info["delta_name"] + + setattr(module, _delta_info['delta_name'], _delta_info["delta_module"]) + + + if _delta_info["method"] in caller_map.keys(): + caller = caller_map[_delta_info["method"]] + new_forward = decorate(module.forward, caller, extras=(module, _delta_info['delta_name']), kwsyntax=True) # decorator.decorate helps preserving the functions metadata (signature, etc.). + module.forward = new_forward.__get__(module, type(module)) # func.__get__(object, type(object)) register a function as an object's method + # for DataParallel's copy behavior. Experimental: + # may have bugs when module.forward is nestedly wrapped. + module._replicate_for_data_parallel = new_replicate_for_data_parallel.__get__(module, type(module)) + else: + raise NotImplementedError(f"_delta_info['method']=='{_delta_info['method']}' is not supported") + + + def insert_sequential_module(self, module, delta_module=None, delta_name='delta', strict=False, _delta_info=None): + r"""insert a module (previous not exists in the code base) before/after a module. Specifically, it modifies the forward + function of the original module to firstly pass the arguments into the new module's forward function and then pass + it into the original ones. The new module can also be inserted after the original module with similar mechanism. + + When implementing the new module , researchers should be aware of the components of arguments of the original module's forward function. + + Args: + module: (:obj:`nn.Module`): The (sub)module to inserted a delta module. + delta_module: (:obj:`DeltaBase`): The delta module to be inserted. + name: (:obj:`str`, *optional*): The name of the delta in the backbone module. + strict: (:obj:`bool`, *optional*): Whether to prohibit modify a modified module. + _delta_info (:obj:`Dict`, *optional*): Used in attach(), reattach a delta module to backbone. The info of + original delta is passed through ``_delta_info``. + + """ + self.insert_module(module, "sequential", delta_module, delta_name, strict, _delta_info) + + + def insert_parallel_module(self, module, delta_module=None, delta_name='delta', strict=False, _delta_info=None): + """insert a module (previous not exists in the code base) across a module. Specifically, it modifies the forward + function of the original module to firstly pass the arguments into the delta model's forward function and set + aside the calculation result. Then combine it with the calculation result output from the backbone module. + + When implementing the new module , researchers should be aware of the arguments and keywards of the original module's forward function. + + Args: + module: (:obj:`nn.Module`): The (sub)module to inserted a delta module. + delta_module: (:obj:`DeltaBase`): The delta module to be inserted. + name: (:obj:`str`, *optional*): The name of the delta in the backbone module. 
+ strict: (:obj:`bool`, *optional*): Whether to prohibit modify a modified module. + _delta_info (:obj:`Dict`, *optional*): Used in attach(), reattach a delta module to backbone. The info of + original delta is passed through ``_delta_info``. + + """ + + self.insert_module(module, "parallel", delta_module, delta_name, strict, _delta_info) + + + def set_active_state_dict(self, module: nn.Module): + r"""modify the state_dict function of the model (by default, the backbone model) to return only the tunable part. + + Args: + module (:obj:`nn.Module`): The module modified. The modification is in-place. + """ + def _caller(_org_func, includes, *args, **kwargs): + state_dict = _org_func(*args, **kwargs) + keys = list(state_dict.keys()) + for n in keys: + if n not in includes: + state_dict.pop(n) + return state_dict + includes = self.trainable_parameters_names(module) # use excludes will have trouble when the model have shared weights + if hasattr(module.state_dict, "__wrapped__"): + raise RuntimeWarning("The forward function might have been wrapped by a decorator, is it intended? Do you freeze the parameters twice?") + module.state_dict = decorate(module.state_dict, _caller, extras=(includes,), kwsyntax=True) # decorator.decorate helps preserving the functions metadata (signature, etc.). + + def _load_state_dict_into_backbone(self, backbone_model: nn.Module = None, state_dict: dict = {}): + r"""[NODOC] + """ + if backbone_model is None: + backbone_model = self.backbone_model + self.backbone_model.load_state_dict(state_dict, strict=False) + + def create_config_from_model(self, ): + r"""[NODOC] If the delta model was built by directly passing arguments, instead of passing a config object. + create the config of the delta model for saving the delta model. + """ + # common_attributes + config = self.config_class() + config_keys = signature(config.__init__)[0] + signature(super(self.config_class, config).__init__)[0] + + for key in config_keys: + val = getattr(self, key) if hasattr(self, key) else None + setattr(config, key, val) + config.delta_type = self.delta_type + self.config = config + + + def log(self, module=None, delta_ratio=True, trainable_ratio=True, visualization=True, cuda_memory=True): + r"""Log and visualize the result of applying delta. + Possible Options are ``trainable_ratio``, + ``visualization``, ``delta_ratio``. + + Args: + delta_ratio (:obj:`bool`, *optional*): Whether computing the ratio of parameters in the delta modules. + trainable_ratio (:obj:`bool`, *optional*): Whether computing the ratio of trainable parameters. + visualization (:obj:`bool`, *optional*): Whether visualize the parameter information of the modified backbone. + + """ + if module is None: + module = self.backbone_model + + + if visualization: + from bigmodelvis import Visualization + Visualization(module).structure_graph() + + self.stat = inspect_module_statistics(module, verbose=False) + if trainable_ratio: + logger.info("Trainable Ratio: {}/{}={:.6f}%".format(self.stat['trainable_parameters'], self.stat['total_parameters'], self.stat['trainable_ratio']*100)) + if delta_ratio: + logger.info("Delta Parameter Ratio: {}/{}={:.6f}%".format(self.stat['delta_parameters'], self.stat['total_parameters'],self.stat['delta_ratio']*100)) + if cuda_memory: + logger.info("Static Memory {:.2f} GB, Max Memory {:.2f} GB".format(self.stat['cudamem'], self.stat['maxcudamem'])) + + + + + # Two functions for plug and remove the delta model. 
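+    # Usage sketch: after a delta model has been built on a backbone, calling ``detach()`` temporarily switches
+    # the inserted deltas off so the backbone behaves like the original model again, and ``attach()`` switches
+    # them back on. Both default to operating on ``self.backbone_model`` when no module is passed.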
+ def attach(self, module: Optional[nn.Module]=None, reset_state_dict=True): + r"""Reattach the delta modules to the backbone. Note that this method can not be used to create new delta modules. + Instead, a :meth:`DeltaBase.detach` should precede this method. + + Args: + module (:obj:`object`, *optional*, default to :obj:`None`): The backbone module that we + reattach the deltas to. + """ + + if module is None: + module = self.backbone_model + + for name, submodule in module.named_modules(): + if hasattr(submodule, "_delta_infos"): + _delta_infos = getattr(submodule, "_delta_infos") + for _delta_info in _delta_infos: + if _delta_info['delta_belong'] is not self: + continue + if _delta_info["state"] == "on": + continue + + if _delta_info['method'] == "replace": + setattr(submodule, _delta_info["child_name"], _delta_info['delta_module']) + elif _delta_info['method'] == "insert_sequential": + self.insert_sequential_module(module=submodule, + _delta_info=_delta_info) + elif _delta_info['method'] == "insert_parallel": + self.insert_parallel_module(module=submodule, + _delta_info=_delta_info) + else: + raise NotImplementedError + + _delta_info['state'] = "on" + if reset_state_dict: + self.set_active_state_dict(module) + + + + def detach(self, module: Optional[nn.Module]=None, reset_state_dict=True): + r"""Detach the delta module from the backbone. The delta module is not deleted, but temporarily turned off. + Use :meth:`DeltaBase.attach` to reattach the delta model to the backbone. + + Args: + module (:obj:`object`, *optional*, default to :obj:`None`): The backbone module that we + detached the deltas from. + """ + + if module is None: + module = self.backbone_model + + for name, submodule in module.named_modules(): + if hasattr(submodule, "_delta_infos"): + _delta_infos = getattr(submodule, "_delta_infos") + for _delta_info in _delta_infos: + if _delta_info['delta_belong'] is not self: + continue + if _delta_info["state"] == "off": + continue + + if _delta_info['method'] == "replace": + setattr(submodule, _delta_info["child_name"], _delta_info['org_module']) + elif _delta_info['method'] in ["sequential", "before", "after", "parallel"]: + if hasattr(submodule.forward, "__wrapped__"): + submodule.forward = submodule.forward.__wrapped__ + delattr(submodule, _delta_info["delta_name"]) + else: + raise AttributeError("submodule {}'s forward has no attribute __wrapped__. It's not a wrapped function.".format(name)) + else: + raise NotImplementedError + + _delta_info['state'] = "off" + if reset_state_dict: + try: + module.state_dict = module.state_dict.__wrapped__ + except AttributeError: + pass + diff --git a/OpenDelta-0.3.2/opendelta/delta_configs.py b/OpenDelta-0.3.2/opendelta/delta_configs.py new file mode 100644 index 0000000..ca722e4 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/delta_configs.py @@ -0,0 +1,362 @@ +import os +import re +from typing import Union, Dict, Any, Tuple, Optional +from opendelta import __version__ as opendelta_version +from opendelta.utils import logging +from opendelta.utils.signature import get_arg_names, get_arg_names_inside_func +import transformers +import json +import copy + +CONFIG_NAME = "config.json" +transformers_version = transformers.__version__ + +checked_package_versions = ["transformers_version", "opendelta_version"] + +logger = logging.get_logger(__name__) +FULL_CONFIGURATION_FILE = "config.json" +_re_configuration_file = re.compile(r"config\.(.*)\.json") + +class BaseDeltaConfig: + r"""Base class for all configuration classes. 
+    Handles a few parameters common to all delta models' configurations as well as methods for loading/downloading/saving configurations.
+
+    Class attributes (overridden by derived classes):
+
+    - **delta_type** (:obj:`str`) -- the name of the delta modules, used to create the correct :py:class:`~opendelta.AutoConfig`.
+
+    Args:
+        modified_modules (:obj:`List[str]`, *optional*, defaults to :obj:`None`):
+            The list of keys to determine which modules you want to modify. OpenDelta will take every module that
+            **ends with** one of the provided keys as a modification target. When no value is given, i.e.
+            ``modified_modules=None``, the delta module will use its corresponding default modified modules.
+            Taking DistilBertModel with a classifier on top as an example:
+
+            .. note::
+
+                **Examples**: When adding delta to `DistilBertModel `_,
+
+                1. setting it to ``["0.attention.out_lin"]`` will add delta modules to the attention output of distilbert's layer 0, i.e., ``distilbert.transformer.layer.0.attention.out_lin``.
+                2. setting it to ``["attention.out_lin"]`` will add delta modules to every layer's ``attention.out_lin``.
+
+        unfrozen_modules (:obj:`List[str]`, *optional*, defaults to :obj:`["deltas"]`): The modules that are unfrozen
+            during training in :meth:`~opendelta.basemodel.DeltaBase.freeze_module`, which includes the ones that are newly introduced as delta modules, and the ones that are originally a part of the model but set to trainable (:obj:`requires_grad=True`) to train together with the delta modules. OpenDelta will take every module that **ends with** one of the provided keys, together with all of its sub-modules and parameters, as trainable.
+
+        exclude_modules (:obj:`str`, *optional*, default to :obj:`None`): Modules whose names start with these strings will be excluded from modification. Note that currently only plain text (no regular expression) is supported.
+
+            .. note::
+
+                **Examples** (for ``unfrozen_modules``): When adding delta to DistilBertModel,
+
+                1. setting this argument to ``["bias"]`` will make all bias terms tunable.
+                2. setting this argument to ``["attention"]`` will make all parameters in all attention modules tunable.
+                3. setting this argument to ``["deltas"]`` will make all the parameters in the newly introduced delta modules tunable.
+                4. setting this argument to ``["classifier"]`` will make all parameters in the classifier tunable.
+                5. setting this argument to ``["3.ffn.lin2", "deltas", "classifier"]`` will make all parameters in the third layer's feed-forward second linear layer, the delta modules, and the classifier module tunable.
+
+        common_structure (:obj:`bool`, *optional*, default to :obj:`None`): Whether to use the common structure mapping of the transformer model when designating ``modified_modules`` and ``unfrozen_modules``.
+        backbone_class (:obj:`str`, *optional*, default to :obj:`None`): The name of the backbone model's class, e.g.
+            ``RobertaForMaskedLM``. Saving this information lets the user explicitly know on which backbone the
+            delta model is trained.
+        backbone_checkpoint_name (:obj:`str`, *optional*, default to :obj:`None`): The specific checkpoint of the model.
+            In the ideal case, it should be the URL to download the checkpoint. However, we do not force the user to
+            specify a downloadable URL here.
+        backbone_hash (:obj:`str`, *optional*, default to :obj:`None`): The md5-hash of the backbone model. It is
+            calculated using the string representation of the model and the sequential expansion of all the
+            parameters in the model.
+            When loading a delta checkpoint in strict mode, the hash of the backbone model
+            will be compared to the hash in this config.
+    """
+    delta_type: str = ""
+
+
+    def __init__(self,
+                 modified_modules = None,
+                 exclude_modules = None,
+                 unfrozen_modules = ["deltas"],
+                 common_structure=False,
+                 backbone_class = None,
+                 backbone_checkpoint_name = None,
+                 backbone_hash = None,
+                 ):
+        arg_names = get_arg_names(BaseDeltaConfig.__init__)
+        for arg_name in arg_names:
+            setattr(self, arg_name, locals()[arg_name])
+
+
+
+
+    @classmethod
+    def from_finetuned(cls, finetuned_delta_path: Union[str, os.PathLike], **kwargs) -> "BaseDeltaConfig":
+        r"""
+        Instantiate a :obj:`BaseDeltaConfig` (or a derived class) from a finetuned delta module configuration.
+
+        Args:
+            finetuned_delta_path (:obj:`str` or :obj:`os.PathLike`): This can be either:
+
+                - a string, the *model id* of a finetuned delta model configuration hosted inside a model repo on
+                  deltahub.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
+                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
+
+                - a path to a *directory* containing a configuration file saved using the :meth:`BaseDeltaConfig.save_finetuned` method, e.g., ``./my_model_directory/``.
+
+                - a path or url to a saved configuration JSON *file*, e.g., ``./my_model_directory/configuration.json``.
+
+            cache_dir (:obj:`str` or :obj:`os.PathLike`, *optional*):
+                Path to a directory in which a downloaded pretrained delta model configuration should be cached if the
+                standard cache should not be used.
+
+        .. code-block:: python
+
+            delta_config = AdapterConfig.from_finetuned("thunlp/FactQA_T5-large_Adapter", backbone_model=t5)
+
+        """
+        config_dict, kwargs = cls.get_config_dict(finetuned_delta_path, **kwargs)
+        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+            logger.warn(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
+
+    def save_finetuned(self, save_directory: Union[str, os.PathLike], **kwargs):
+        """
+        Save a configuration object to the directory :obj:`save_directory`, so that it can be re-loaded using the
+        :meth:`BaseDeltaConfig.from_finetuned` class method.
+
+        Args:
+            save_directory (:obj:`str` or :obj:`os.PathLike`): Directory where the configuration JSON file
+                will be saved (will be created if it does not exist).
+            push_to_hub (:obj:`bool`, *optional*, defaults to :obj:`False`): Whether or not to push your model to
+                the Hugging Face model hub after saving it.
+
+                .. warning::
+
+                    1. Will raise an error if you haven't configured a Hugging Face Model Hub.
+                    2. Using ``push_to_hub=True`` will synchronize the repository you are pushing to with ``save_directory``, which requires ``save_directory`` to be a local clone of the repo you are pushing to if it's an existing folder. Pass along ``temp_dir=True`` to use a temporary directory instead.
+
+            kwargs: Additional keyword arguments.
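+
+        For example (a minimal sketch; the directory name is only illustrative):
+
+        .. code-block:: python
+
+            delta_config.save_finetuned("./delta_ckpt/")
+            # ...later, reload it from the same directory
+            delta_config = AdapterConfig.from_finetuned("./delta_ckpt/")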
+ """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + # If we save using the predefined names, we can load using `from_pretrained` + output_config_file = os.path.join(save_directory, CONFIG_NAME) + + self.to_json_file(output_config_file, use_diff=True) + logger.info(f"Configuration saved in {output_config_file}") + + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "BaseDeltaConfig": + r""" + Instantiate a :obj:`BaseDeltaConfig` from a python dictionary of parameters. + + Args: + config_dict (:obj:`Dict[str, Any]`): + Dictionary that will be used to instantiate the configuration object. Such a dictionary can be + retrieved from a pretrained checkpoint by leveraging the :py:meth:`~PretrainedConfig.get_config_dict` method. + kwargs (:obj:`Dict[str, Any]`): + Additional parameters from which to initialize the configuration object. + Returns: + :obj:`BaseDeltaConfig`: The configuration object instantiated from those parameters. + """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + accept_args = get_arg_names(cls.__init__) + get_arg_names(BaseDeltaConfig.__init__) + unused_config_keys = [] + for config_key in list(config_dict.keys()): + if config_key not in accept_args: + config_dict.pop(config_key) + unused_config_keys.append(config_key) + logger.warning(f"The following keys are not used by {cls}.__init__ function: {unused_config_keys}") + + config = cls(**config_dict) + + + # Update config with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(config, key): + + setattr(config, key, value) + if key != "torch_dtype": + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + logger.info(f"Model config\n{config}") + + if return_unused_kwargs: + return config, kwargs + else: + return config + + @classmethod + def get_config_dict( + cls, finetuned_delta_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """[NODOC] + From a ``finetuned_delta_path``, resolve to a dictionary of parameters, to be used for instantiating a + [``PretrainedConfig``] using ``from_dict``. + Parameters: + finetuned_delta_path (:obj:`str` or :obj:`os.PathLike`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. + Returns: + :obj:`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object. 
+ """ + cache_dir = kwargs.get("cache_dir", None) + force_download = kwargs.get("force_download", False) + # resume_download = kwargs.pop("resume_download", False) + # proxies = kwargs.pop("proxies", None) + # use_auth_token = kwargs.pop("use_auth_token", None) + local_files_only = kwargs.get("local_files_only", False) + # revision = kwargs.pop("revision", None) + # from_pipeline = kwargs.pop("_from_pipeline", None) + # from_auto_class = kwargs.pop("_from_auto", False) + + # user_agent = {"file_type": "config", "from_auto_class": from_auto_class} + # if from_pipeline is not None: + # user_agent["using_pipeline"] = from_pipeline + + if os.environ.get("DELTACENTER_OFFLINE", '0') == '1': + logger.info("Delta Center offline mode!") + local_files_only = True + + finetuned_delta_path = str(finetuned_delta_path) + + if cache_dir is not None: + cached_finetuned_delta_path = os.path.join(cache_dir, finetuned_delta_path) + else: + cached_finetuned_delta_path = finetuned_delta_path + + if os.path.isfile(cached_finetuned_delta_path): + local_files_only = True + elif os.path.isdir(cached_finetuned_delta_path): + # cached_finetuned_delta_path = os.path.join(cached_finetuned_delta_path, 'config.json') + local_files_only = True + + # if local_files_only: + # config_dict = cls._dict_from_json_file(cached_finetuned_delta_path) + if not local_files_only or force_download: + from .utils.delta_center import download as dcdownload + # try to download from DeltaCenter + cached_finetuned_delta_path = dcdownload(finetuned_delta_path, force_download=force_download, cache_dir=cache_dir) + kwargs['force_download'] = False # Has been downloaded, not more forcing + + cached_finetuned_delta_path = os.path.join(cached_finetuned_delta_path, 'config.json') + config_dict = cls._dict_from_json_file(cached_finetuned_delta_path) + return config_dict, kwargs + + @classmethod + def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]): + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + return json.loads(text) + + def __repr__(self): + return f"{self.__class__.__name__} {self.to_json_string()}" + + def __eq__(self, other): + return self.__dict__ == other.__dict__ + + def to_json_string(self, use_diff: bool = True) -> str: + """[NODOC] + Serializes this instance to a JSON string. + Args: + use_diff (:obj:`bool`, *optional*, defaults to :obj:`True`): + If set to :obj:`True`, only the difference between the config instance and the default ``PretrainedConfig()`` + is serialized to JSON string. + Returns: + :obj:`str`: String containing all the attributes that make up this configuration instance in JSON format. + """ + if use_diff is True: + config_dict = self.to_diff_dict() + else: + config_dict = self.to_dict() + return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike], use_diff: bool = True): + """[NODOC] + Save this instance to a JSON file. + Args: + json_file_path (:obj:`str` or :obj:`os.PathLike`): + Path to the JSON file in which this configuration instance's parameters will be saved. + use_diff (:obj:`bool`, *optional*, defaults to :obj:`True`): + If set to :obj:`True`, only the difference between the config instance and the default ``PretrainedConfig()`` + is serialized to JSON file. 
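+
+        For example (a minimal sketch; the file name is only illustrative):
+
+        .. code-block:: python
+
+            config.to_json_file("my_delta_config.json")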
+ """ + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string(use_diff=use_diff)) + + def to_diff_dict(self) -> Dict[str, Any]: + """[NODOC] + Removes all attributes from config which correspond to the default config attributes for better readability and + serializes to a Python dictionary. + Returns: + :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = BaseDeltaConfig().to_dict() + + # get class specific config dict + class_config_dict = self.__class__().to_dict() #if not self.is_composition else {} + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if ( + key not in default_config_dict + or key in checked_package_versions + or value != default_config_dict[key] + or (key in class_config_dict and value != class_config_dict[key]) + ): + serializable_config_dict[key] = value + + self.dict_torch_dtype_to_str(serializable_config_dict) + + return serializable_config_dict + + def update(self, config_dict: Dict[str, Any]): + """[NODOC] + Updates attributes of this class with attributes from ``config_dict``. + Args: + config_dict (:obj:`Dict[str, Any]`): Dictionary of attributes that should be updated for this class. + """ + for key, value in config_dict.items(): + setattr(self, key, value) + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + """ + output = copy.deepcopy(self.__dict__) + if hasattr(self.__class__, "model_type"): + output["model_type"] = self.__class__.model_type + + # Transformers version when serializing the model + output["transformers_version"] = transformers_version + output["opendelta_version"] = opendelta_version + + self.dict_torch_dtype_to_str(output) + + return output + + def dict_torch_dtype_to_str(self, d: Dict[str, Any]) -> None: + """[NODOC] + Checks whether the passed dictionary has a *torch_dtype* key and if it's not None, converts torch.dtype to a + string of just the type. For example, ``torch.float32`` get converted into *"float32"* string, which can then be + stored in the json format. 
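+
+        For example (a minimal sketch):
+
+        .. code-block:: python
+
+            d = {"torch_dtype": torch.float32}
+            config.dict_torch_dtype_to_str(d)
+            # d is now {"torch_dtype": "float32"}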
+        """
+        if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str):
+            d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1]
+
+
+
+
+if __name__ == "__main__":
+    myconfig = BaseDeltaConfig.from_finetuned("../ckpts/lora/")
+    myconfig.save_finetuned("../ckpts/lora.1/")
+    print(myconfig)
\ No newline at end of file
diff --git a/OpenDelta-0.3.2/opendelta/delta_models/__init__.py b/OpenDelta-0.3.2/opendelta/delta_models/__init__.py
new file mode 100644
index 0000000..c57b864
--- /dev/null
+++ b/OpenDelta-0.3.2/opendelta/delta_models/__init__.py
@@ -0,0 +1,2 @@
+from .lora import LoraModel, LoraConfig
+from .bitfit import BitFitModel
diff --git a/OpenDelta-0.3.2/opendelta/delta_models/adapter.py b/OpenDelta-0.3.2/opendelta/delta_models/adapter.py
new file mode 100644
index 0000000..4f191b4
--- /dev/null
+++ b/OpenDelta-0.3.2/opendelta/delta_models/adapter.py
@@ -0,0 +1,252 @@
+
+from typing import Optional, Union
+from opendelta.utils.signature import get_arg_names_inside_func
+from opendelta.utils.name_based_addressing import *
+from opendelta.utils.cuda import get_device
+from opendelta.basemodel import DeltaBase
+import torch.nn as nn
+import torch
+from opendelta.delta_models.layers.activations import Activations
+from opendelta import BaseDeltaConfig
+import opendelta.utils.logging as logging
+import numpy as np
+from opendelta import global_setting
+from dataclasses import dataclass, field
+
+logger = logging.get_logger(__name__)
+
+
+class InterFaceMixin:
+    def __init__(self):
+        self._axis_order = global_setting.axis_order
+        self._reverse_axis_order = np.argsort(self._axis_order).tolist()
+
+    def _transpose(self, tensor):
+        if tensor.dim() == 3:
+            return tensor.permute(*self._axis_order)
+        else:
+            return tensor
+
+
+
+    def _reverse_transpose(self, tensor):
+        if tensor.dim() == 3:
+            return tensor.permute(*self._reverse_axis_order).contiguous()
+        else:
+            return tensor
+
+    def _convert_data_type(self, tensor):
+        self._data_type_record = tensor.dtype
+        self._device_record = tensor.device
+        return tensor.to(torch.float32).to(self._get_device())
+
+    def _reverse_data_type(self, tensor):
+        return tensor.to(self._data_type_record).to(self._device_record)
+
+
+
+
+
+class AdapterLayer(nn.Module, InterFaceMixin):
+    r"""A layer of the adapter tuning module.
+ """ + layer_count = 0 + + @classmethod + def count_layer(cls): + cls.layer_count += 1 + + @classmethod + def get_layer_count(cls): + return cls.layer_count + + def __init__(self, bottleneck_dim=24, non_linearity='gelu_new', device=None, backend="hf"): + super().__init__() + InterFaceMixin.__init__(self) + self.bottleneck_dim = bottleneck_dim + self.init_device = device + self.instantiated = False + self.non_linearity = non_linearity + self.backend=backend + + self.layer_id = AdapterLayer.get_layer_count() + AdapterLayer.count_layer() + + + + def _get_device(self): + if self.instantiated: + return self.modulelist.down_proj.weight.device + else: + return self.init_device + + def instantiate(self, hiddens): + self.hidden_dim = hiddens.shape[-1] + self.hidden_dtype = hiddens.dtype + self.modulelist = nn.Sequential() + self.modulelist.add_module("down_proj",nn.Linear(self.hidden_dim, self.bottleneck_dim, device=self.init_device, dtype=self.hidden_dtype)) + + # select non-linearity + self.modulelist.add_module("non_linear", Activations(self.non_linearity.lower())) + + self.modulelist.add_module("up_proj", nn.Linear(self.bottleneck_dim, self.hidden_dim, device=self.init_device, dtype=self.hidden_dtype)) + + # TODO: + # If we want to have a layer norm on output, we apply it later after a separate residual connection + # This means that we learn a new output layer norm, which replaces another layer norm learned in the bert layer + # if self.add_layer_norm_after: + # self.adapter_norm_after = nn.LayerNorm(self.input_size) + + self.instantiated = True + # initialize the weight, which is important for fast convergence and better performance. + self.apply(self._init_weight) + if self.backend == 'bmt': + import bmtrain as bmt + self.modulelist = bmt.BMTrainModelWrapper(self.modulelist) + + def _init_weight(self, module): + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=0.01) + if module.bias is not None: + module.bias.data.zero_() + + + def post_forward(self, output): + r""" Get the hidden_states from the PLM's layer output, pass it into the adapter, + then combined with the main hidden_states. Finally pass it into the subsequent layer. 
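+        Schematically (a simplified sketch of the computation once the layer is instantiated):
+
+        .. code-block:: python
+
+            adapter_output = self.modulelist(hiddens)    # down_proj -> non_linear -> up_proj
+            modified_output = adapter_output + hiddens   # residual connection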
+ + """ + if isinstance(output, tuple): + hiddens = output[0] + elif isinstance(output, torch.Tensor): + hiddens = output + else: + raise TypeError + + hiddens = self._transpose(hiddens) + # if self.backend == 'hf': + # hiddens = self._convert_data_type(hiddens) + # elif self.backend == 'bmt': # if bmt, left the convertion to bmt + # pass + + if not self.instantiated: + # self.hidden_dim = hiddens.shape[-1] + # logger.debug(f"Got hidden dim hidden_dim {self.hidden_dim}") + self.instantiate(hiddens=hiddens) + + # from IPython import embed; embed(header="14135315") + adapter_output = self.modulelist(hiddens) + modified_output = adapter_output + hiddens # TODO option: disable residual_connection + + modified_output = self._reverse_transpose(modified_output) + + # if self.backend == 'hf': + # # print("!"*100) + # modified_output = self._reverse_data_type(modified_output) + # elif self.backend == 'bmt': # if bmt, left the convertion to bmt + # print("!"*100) + # pass + + + if isinstance(output, tuple): + output = (modified_output,) + output[1:] + elif isinstance(output, torch.Tensor): + output = modified_output + else: + raise TypeError + return output + + + +class AdapterConfig(BaseDeltaConfig): + r""" + This is the configuration class to store the configuration of a :py:class:`~AdapterModel` + + """ + def __init__( + self, + bottleneck_dim: Optional[int]=24, + non_linearity: Optional[str]='gelu_new', + **kwargs + ): + super().__init__(**kwargs) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # the arg has not been registered in parent config + setattr(self, arg_name, locals()[arg_name]) + + + +class AdapterModel(DeltaBase): + r""" The implementation of Adapter(`Parameter-Efficient Transfer Learning for NLP `_ ) . + Add adapter to the designated ``modified_modules``. In sequential paradigm, The modules' output is then passed into the adapter's + post_forward. + + .. note:: + We **assume** the output of the modified module is the hidden state or a tuple where hidden state is the + first element. This is true for most PLMs. However, we admit that currently it's not rigorous, We will improve + it in the next version. Currently, if you encount an error here for you backbone, you can modify the code to + get the hidden state. + + class attributes: + - default_modified_modules = ["attn", "ff"] According to the Adapter paper, we add adapter to the attention layer + and feed forward layer. + - delta_type = "adapter" + + Args: + backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. + bottleneck_dim (:obj:`int`): The dimension of the adapter's bottleneck. + non_linearity (:obj:`str`): The non linearity of the adapter. + modified_modules (:obj:`List[str]`): modules to add adapter after them. + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen together with the adapter parameters. + common_structure (:obj:`bool`): whether using name-based addressing witha common structure mapping. + backend (:obj:`str`): choose the backend of plm, 'hf' for huggingface transformers,'bmt' for bmtrain. 
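+
+    Example (a minimal sketch; the T5 checkpoint name is only illustrative, and the import path assumes
+    this source tree):
+
+    .. code-block:: python
+
+        from transformers import AutoModelForSeq2SeqLM
+        from opendelta.delta_models.adapter import AdapterModel
+
+        backbone = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+        delta_model = AdapterModel(backbone_model=backbone, bottleneck_dim=24)
+        delta_model.log()   # inspect the trainable ratio and the modified structure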
+ + """ + config_class = AdapterConfig + delta_type = "adapter" + default_modified_modules = ["attn@.proj@", "ff@.w2@"] + _supported_backends = ['hf', 'bmt'] + _need_pseudo_data = True + def __init__(self, + backbone_model: nn.Module, + bottleneck_dim: Optional[int]=24, + non_linearity: Optional[str]='gelu_new', + modified_modules: Optional[List[str]] = None, + exclude_modules: Optional[List[str]] = None, + unfrozen_modules: Optional[bool] = None, + common_structure: Optional[bool] = None, + interactive_modify: Optional[Union[bool, int]] = False, + backend: Optional[str] = 'hf', + ): + DeltaBase.__init__(self, + backbone_model, + modified_modules=modified_modules, + exclude_modules=exclude_modules, + unfrozen_modules=unfrozen_modules, + common_structure=common_structure, + interactive_modify=interactive_modify, + backend=backend, + ) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # not registered in parent class + setattr(self, arg_name, locals()[arg_name]) + + self.delta_modules = nn.ModuleList() + + self.add_all_delta_to_backbone(self.backbone_model, + self.modified_modules, + ) + + + def update_module(self, module: nn.Module, key: str): + _, _, ref = self.find_module(module, key) + adapterlayer = self.new_module_like(ref) + self.insert_sequential_module(ref, delta_module=adapterlayer, delta_name="adapter") + + def new_module_like(self, module): + module_device = get_device(module) + adapterlayer = AdapterLayer(bottleneck_dim=self.bottleneck_dim, non_linearity=self.non_linearity, device=module_device, backend=self.backend) + self.delta_modules.append(adapterlayer) + return adapterlayer diff --git a/OpenDelta-0.3.2/opendelta/delta_models/bitfit.py b/OpenDelta-0.3.2/opendelta/delta_models/bitfit.py new file mode 100644 index 0000000..221d84d --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/delta_models/bitfit.py @@ -0,0 +1,209 @@ +from typing import Optional, Union +from opendelta.utils.signature import get_arg_names_inside_func +from opendelta.utils.name_based_addressing import * +from opendelta.basemodel import DeltaBase, is_leaf_module +from opendelta.utils.cuda import get_device, get_dtype +import torch.nn as nn + +import torch +from torch.nn import init +import math +from opendelta import BaseDeltaConfig +import opendelta.utils.logging as logging +logger = logging.get_logger(__name__) + + +class BitFitConfig(BaseDeltaConfig): + r""" + This is the configuration class to store the configuration of a :py:class:`~BitFitModel` + + """ + def __init__( + self, + **kwargs + ): + super().__init__(**kwargs) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # the arg has not been registered in parent config + setattr(self, arg_name, locals()[arg_name]) + +class BiasLayer(nn.Module): + def __init__(self, init_method="zero", dtype=None, device=None, backend=None): + super().__init__() + self.init_method=init_method + self.instantiated = False + self.dtype = dtype + self.device = device + self.backend = backend + + def instantiate(self, hidden_dim): + if self.init_method == "zero": + self.bias = nn.Parameter(torch.zeros(hidden_dim, dtype=self.dtype, device=self.device)) + else: + raise NotImplementedError + self.instantiated = True + if self.backend == 'bmt': + import bmtrain as bmt + self.bias = bmt.BMTrainModelWrapper(self.bias) + + def post_forward(self, output): + r"""Presuming the first argument is the tensor to add bias along the last dimension. 
+        In most cases, this is correct. However, be aware of the possibility that the presumption
+        doesn't hold.
+        """
+        if isinstance(output, tuple):
+            hiddens = output[0]
+        elif isinstance(output, torch.Tensor):
+            hiddens = output
+        else:
+            raise TypeError
+
+        if not self.instantiated:
+            self.hidden_dim = hiddens.shape[-1]
+            logger.debug(f"Got hidden_dim {self.hidden_dim}")
+            self.instantiate(hidden_dim=self.hidden_dim)
+
+        modified_output = hiddens + self.bias
+
+        if isinstance(output, tuple):
+            output = (modified_output,) + output[1:]
+        elif isinstance(output, torch.Tensor):
+            output = modified_output
+        else:
+            raise TypeError
+        return output
+
+
+
+class BitFitModel(DeltaBase):
+    r""" The implementation of `BitFit: Simple Parameter-efficient Fine-tuning for Transformer-based Masked Language-models `_ .
+    Unfreeze the bias terms (or add bias terms if they are absent in the backbone, e.g. T5) of the modules in
+    a transformer block.
+
+    .. note::
+
+        **Broadcast to Submodule**: We modify all potential positions of the specified
+        ``modified_modules``. That is to say, if we specify ``attn`` in the modified_modules, then all positions,
+        including the q, k, v and output linear layers of the attention layer, have a bias layer added (or their
+        existing bias unfrozen). The potential positions are determined according to equations (1)-(5) and the
+        three preceding equations.
+
+
+    class attributes:
+        - default_modified_modules = ["attn", "ff", "layer_norm","lm_head.proj"] According to the paper and the
+          implementation in `Compacter's baseline `_ , we modify the
+          bias terms in the above modules.
+        - delta_type = "bitfit"
+
+
+
+
+    Args:
+        backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified.
+        modified_modules (:obj:`List[str]`): The modules whose bias terms are unfrozen (or added, if the module
+            has no bias term).
+        unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen
+            together with the bias parameters.
+        common_structure (:obj:`bool`): whether to use name-based addressing with a common structure mapping.
+
+    """
+
+
+    config_class = BitFitConfig
+    delta_type = "bitfit"
+    default_modified_modules = ["attn@", "ff@", "layer_norm@","lm_head@.proj@"] # modify all the bias parameters in the attention, feed-forward and layer-norm modules, plus the LM head projection.
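+    # Concretely: every sub-module matched by one of the keys above (and all of its children) either has its
+    # existing bias unfrozen, or, if it is a Linear/LayerNorm without a bias, gets a newly initialized bias
+    # parameter; other module types get a BiasLayer appended via insert_sequential_module
+    # (see add_bias_to_others below).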
+ _supported_backends = ['hf', 'bmt'] + _need_pseudo_data = False + def __init__(self, + backbone_model: nn.Module, + modified_modules: Optional[List[str]] = None, + exclude_modules: Optional[List[str]] = None, + unfrozen_modules: Optional[List[str]] = None, + common_structure: Optional[bool] = None, + interactive_modify: Optional[Union[bool, int]] = False, + backend: Optional[str] = "hf", + ): + DeltaBase.__init__(self, + backbone_model, + modified_modules=modified_modules, + exclude_modules=exclude_modules, + unfrozen_modules=unfrozen_modules, + common_structure=common_structure, + interactive_modify=interactive_modify, + backend=backend, + ) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # not registered in parent class + setattr(self, arg_name, locals()[arg_name]) + + self.delta_params = nn.ParameterList() + self.delta_modules = nn.ModuleList() + + self.add_all_delta_to_backbone(self.backbone_model, + self.modified_modules) + + + + + def update_module(self, module: nn.Module, key: str): + _, _, ref = self.find_module(module, key) + self.modify_module(ref) + + + def modify_module(self, + module: nn.Module, + ): + if is_leaf_module(module): + if self.backend_mapping.check_type(module, 'linear') or \ + self.backend_mapping.check_type(module, 'layer_norm'): + self.add_bias_to_modules_have_bias_or_known_type(module) + else: + self.add_bias_to_others(module) + else: + for n, c in module.named_modules(): + self.add_bias_to_modules_have_bias_or_known_type(c) + + def add_bias_to_modules_have_bias_or_known_type(self, c): + '''If it has bias, unfreeze it. + If it doesn't have bias: if it is Linear of LN, add to it, else pass. + ''' + if 'bias' in [n for n,p in c.named_parameters()]: + c.bias.requires_grad = True + self.delta_params.append(c.bias) + else: + if self.backend_mapping.check_type(c, 'linear') or \ + self.backend_mapping.check_type(c, 'layer_norm'): + bias = nn.Parameter(torch.empty(c.out_features), requires_grad=True) + + self._reset_bias_parameters(c, bias) + if self.backend == 'bmt': + import bmtrain as bmt + bias = bmt.BMTrainModelWrapper(bias) + + c.register_parameter('bias', bias) + self.delta_params.append(bias) + + def add_bias_to_others(self, c): + new_bias = BiasLayer(dtype=get_dtype(c), device=get_device(c), backend=self.backend) + + self.insert_sequential_module(c, delta_module=new_bias, delta_name="bitfit") # name shouldn't be `bias` here, since the name `bias` is reserved for some module such as roberta's LayerNorm. + self.delta_modules.append(new_bias) + + @staticmethod + def _reset_bias_parameters(linear_module, bias): + fan_in, _ = init._calculate_fan_in_and_fan_out(linear_module.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(bias, -bound, bound) + # init.uniform_(bias, -bound, bound) + + def detach(self, module): + r"""Not implemented for BitFit yet. Please wait for the next version. + """ + raise NotImplementedError + + def attach(self, module): + r"""Not implemented for BitFit yet. Please wait for the next version. 
+ """ + raise NotImplementedError diff --git a/OpenDelta-0.3.2/opendelta/delta_models/compacter.py b/OpenDelta-0.3.2/opendelta/delta_models/compacter.py new file mode 100644 index 0000000..9743aca --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/delta_models/compacter.py @@ -0,0 +1,298 @@ +from functools import partial +from typing import Optional, Union +from opendelta.delta_configs import BaseDeltaConfig +from opendelta.utils.signature import get_arg_names_inside_func +from opendelta.utils.name_based_addressing import * +from opendelta.utils.cuda import get_device +from opendelta.basemodel import DeltaBase +import torch.nn as nn +import torch +from opendelta.delta_models.layers.activations import Activations +import inspect +from opendelta.delta_models.layers.hypercomplex_linear import PHMLinear +import opendelta.utils.logging as logging +logger = logging.get_logger(__name__) + +class HyperComplexAdapterLayer(nn.Module): + """Hypercomplex Adapter layer, in which the weights of up and down sampler modules + are parameters are 1/n times of the conventional adapter layers, where n is + hypercomplex division number.""" + + def __init__(self, + reduction_factor=16, + non_linearity="relu", + phm_c_init="normal", + hypercomplex_division=4, + learn_phm=True, + hypercomplex_nonlinearity="glorot-uniform", + shared_phm_rule=False, + factorized_phm=True, + phm_rule: Optional[torch.Tensor]=None, + shared_W_phm=False, + factorized_phm_rule=False, + phm_rank=1, + phm_init_range=0.0001, + kronecker_prod=None, + device=None, + use_bias_up_sampler=True, + use_bias_down_sampler=True, + backend = 'hf', + ): + super().__init__() + self.reduction_factor = reduction_factor + self.non_linearity = non_linearity + self.phm_c_init = phm_c_init + self.hypercomplex_division = hypercomplex_division + self.learn_phm = learn_phm + self.phm_rule=phm_rule + self.hypercomplex_nonlinearity = hypercomplex_nonlinearity + self.shared_phm_rule = shared_phm_rule + self.factorized_phm = factorized_phm + self.shared_W_phm = shared_W_phm + self.factorized_phm_rule = factorized_phm_rule + self.phm_rank = phm_rank + self.phm_init_range = phm_init_range + self.kronecker_prod = kronecker_prod + self.use_bias_up_sampler=use_bias_up_sampler + self.use_bias_down_sampler=use_bias_down_sampler + self.device = device + self.backend = backend + + self.instantiated = False + + + def instantiate(self, hiddens): + self.hidden_dim = hiddens.shape[-1] + self.hidden_dtype = hiddens.dtype + self.down_sample_size = self.hidden_dim // self.reduction_factor + self.activation = Activations(self.non_linearity.lower()).to(self.device) + self.down_sampler = PHMLinear(in_features=self.hidden_dim, + out_features=self.down_sample_size, + bias=self.use_bias_down_sampler, + c_init=self.phm_c_init, + phm_dim=self.hypercomplex_division, + phm_rule=self.phm_rule, + learn_phm=self.learn_phm, + w_init=self.hypercomplex_nonlinearity, + shared_phm_rule=self.shared_phm_rule, + factorized_phm=self.factorized_phm, + shared_W_phm=self.shared_W_phm, + factorized_phm_rule=self.factorized_phm_rule, + phm_rank=self.phm_rank, + phm_init_range=self.phm_init_range, + kronecker_prod=self.kronecker_prod, + dtype = self.hidden_dtype).to(self.device) + self.up_sampler = PHMLinear(in_features=self.down_sample_size, + out_features=self.hidden_dim, + bias=self.use_bias_up_sampler, + c_init=self.phm_c_init, + phm_dim=self.hypercomplex_division, + phm_rule=self.phm_rule, + learn_phm=self.learn_phm, + w_init=self.hypercomplex_nonlinearity, + shared_phm_rule=self.shared_phm_rule, + 
factorized_phm=self.factorized_phm, + shared_W_phm=self.shared_W_phm, + factorized_phm_rule=self.factorized_phm_rule, + phm_rank=self.phm_rank, + phm_init_range=self.phm_init_range, + kronecker_prod=self.kronecker_prod, + dtype = self.hidden_dtype).to(self.device) + self.instantiated = True + if self.backend == "bmt": + import bmtrain as bmt + self.activation = bmt.BMTrainModelWrapper(self.activation) + self.down_sampler = bmt.BMTrainModelWrapper(self.down_sampler) + self.up_sampler = bmt.BMTrainModelWrapper(self.up_sampler) + + + def post_forward(self, output): + r""" Get the hidden_states from the PLM's layer output, pass it into the hypercomplex adapter, + then combined with the main hidden_states. Finally pass it into the subsequent layer. + + """ + + if isinstance(output, tuple): + hiddens = output[0] + elif isinstance(output, torch.Tensor): + hiddens = output + else: + raise TypeError + + if not self.instantiated: + self.instantiate(hiddens=hiddens) + + + z = self.down_sampler(hiddens) + z = self.activation(z) + adapter_output = self.up_sampler(z) + + modified_output = adapter_output + hiddens # residual_connection + if isinstance(output, tuple): + output = (modified_output,) + output[1:] + elif isinstance(output, torch.Tensor): + output = modified_output + else: + raise TypeError + return output + +class CompacterConfig(BaseDeltaConfig): + r""" + This is the configuration class to store the configuration of a :py:class:`~CompacterModel` + + """ + def __init__( + self, + bottleneck_dim: Optional[int]=32, + non_linearity: Optional[str]='relu', + sequential: Optional[str] = True, + reduction_factor=16, + phm_c_init="normal", + hypercomplex_division=4, + learn_phm=True, + hypercomplex_nonlinearity="glorot-uniform", + shared_phm_rule=False, + factorized_phm=True, + shared_W_phm=False, + factorized_phm_rule=False, + phm_rank=1, + phm_init_range=0.0001, + kronecker_prod=None, + use_bias_up_sampler=True, + use_bias_down_sampler=True, + **kwargs + ): + super().__init__(**kwargs) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # the arg has not been registered in parent config + setattr(self, arg_name, locals()[arg_name]) + + + +class CompacterModel(DeltaBase): + r""" The implementation of `Compacter: Efficient Low-Rank Hypercomplex Adapter Layers `_ . + Add compacter layer to the designated ``modified_modules``. In sequential paradigm, The modules' output is then + passed into the compacter's post_forward. + + .. note:: + We **assume** the output of the modified module is the hidden state or a tuple where hidden state is the + first element. This is true for most PLMs. However, we admit that currently it's not rigorous, We will improve + it in the next version. Currently, if you encount an error here for you backbone, you can modify the code to + get the hidden state. + + All the hyperparameter is adopted from the `compacter code base `_ . + + class attributes: + - default_modified_modules = ["attn", "ff"] According to the compacter paper, we add compacter to the attention layer + and feed forward layer. + - delta_type = "compacter" + + Args: + backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. + modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only + the implemented ones) + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen + together with the prefix parameters. 
+ common_structure (:obj:`bool`, *optional*, default to :obj:`None`): whether using name-based addressing with a common structure mapping. + backend (:obj:`str`): choose the backend of plm, 'hf' for huggingface transformers,'bmt' for bmtrain + reduction_factor (:obj:`int`, *optional*, default to ``16``): bottleneck_dim = hidden_dim//reduction_factor + non_linearity (:obj:`str`, *optional*, default to ``"gelu_new"``): The non linearity activation used in between the down + projecter and the up projecter. + phm_c_init (:obj:`str`, *optional*, default to ``"normal"``): The initialize method of the C in compacter. + hypercomplex_division (:obj:`str`, *optional*, default to 4): The ``n`` in the paper. The number of division along a dimension in compector. + learn_phm (:obj:`bool`, *optional*, default to :obj:`True` ): Whether the phm rule requires_grad. Note that we didn't check the performance of learn_phm=False. + hypercomplex_nonlinearity (:obj:`str`, *optional*, default to ``"glorot-uniform"``): The initialize method of the W in compacter. + shared_phm_rule (:obj:`str`, *optional* , default to :obj:`False`): Whether the phm rule is shared accross layer. + factorized_phm (:obj:`str`, *optional*, default to :obj:`True`): Whether to factorize the phm into low rank product. + shared_W_phm (:obj:`str`, *optional* , default to :obj:`False`): Whether the W_phm is shared accross layer. + factorized_phm_rule (:obj:`str`, *optional* , default to :obj:`False`): Whether to factorize the phm rule into low rank product. + phm_rank=1 (:obj:`int`, *optional*, default to 1): The rank of low rank decomposition of phm. + phm_init_range (:obj:`float`, *optional*, default to 0.0001): The range of phm initialization. + kronecker_prod (:obj:`bool`, *optional*, default to False): Whether to perform kronecker_prod in matvec_product, proposed by + `Parameterization of Hypercomplex Multiplications `_ + use_bias_up_sampler (:obj:`float`, *optional*, default to :obj:`True`): Whether add bias to the up projector. + Note that the bias for this is a ``hidden_dim`` vector. + use_bias_down_sampler (:obj:`float`, *optional*, default to :obj:`True`): Whether add bias to the down projector. + Note that the bias for this is a ``bottleneck_dim`` vector. + + + """ + config_class = CompacterConfig + delta_type = "compacter" + default_modified_modules = ["attn@.proj@", "ff@.w2@"] + _supported_backends = ['hf', 'bmt'] + _need_pseudo_data = True + def __init__(self, + backbone_model, + modified_modules: Optional[List[str]] = None, + exclude_modules: Optional[List[str]] = None, + unfrozen_modules: Optional[List[str]] = None, + common_structure: Optional[bool] = None, + interactive_modify: Optional[Union[bool, int]] = False, + backend: Optional[str] = 'hf', + reduction_factor=16, + non_linearity="gelu_new", + phm_c_init="normal", + hypercomplex_division=4, + learn_phm=True, + hypercomplex_nonlinearity="glorot-uniform", + shared_phm_rule=False, + factorized_phm=True, + shared_W_phm=False, + factorized_phm_rule=False, + phm_rank=1, + phm_init_range=0.0001, + kronecker_prod=None, + use_bias_up_sampler=True, + use_bias_down_sampler=True, + ): + DeltaBase.__init__(self, + backbone_model, + modified_modules=modified_modules, + exclude_modules=exclude_modules, + unfrozen_modules=unfrozen_modules, + common_structure=common_structure, + interactive_modify=interactive_modify, + ) + assert shared_phm_rule == False, "In opendelta version {opendelta.__version__}, "\ + "shared_phm_rule is not supported. 
Later, sharing parameters will be tackled using"\ + "a unified paradigm." + assert shared_W_phm == False, "In opendelta version {opendelta.__version__}, "\ + "shared_W_phm is not supported. Later, sharing parameters will be tackled using"\ + "a unified paradigm." + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # not registered in parent class + setattr(self, arg_name, locals()[arg_name]) + + self.delta_modules = nn.ModuleList() + + self.add_all_delta_to_backbone(self.backbone_model, + self.modified_modules, + ) + + + # def add_all_delta_to_backbone(self, + # module: nn.Module, + # modified_modules: List[str], + # ) -> nn.Module: + # for key, _ in module.named_modules(): + # if self.find_key(key, modified_modules): + # self.update_module(module, key) + # self._pseudo_data_to_instantiate(module) + # self.mark_as_delta() + # return module + + def update_module(self, module: nn.Module, key: str): + _, _, ref = self.find_module(module, key) + adapterlayer = self.new_module_like(ref) + self.insert_sequential_module(ref, + delta_module=adapterlayer, + delta_name="compactor") + + def new_module_like(self, module): + module_device = get_device(module) + adapterlayer = HyperComplexAdapterLayer(reduction_factor=self.reduction_factor, non_linearity=self.non_linearity, phm_c_init=self.phm_c_init, hypercomplex_division=self.hypercomplex_division, learn_phm=self.learn_phm, hypercomplex_nonlinearity=self.hypercomplex_nonlinearity, shared_phm_rule=self.shared_phm_rule, factorized_phm=self.factorized_phm, shared_W_phm=self.shared_W_phm, factorized_phm_rule=self.factorized_phm_rule, phm_rank=self.phm_rank, phm_init_range=self.phm_init_range, kronecker_prod=self.kronecker_prod, use_bias_up_sampler=self.use_bias_up_sampler, use_bias_down_sampler=self.use_bias_down_sampler, device=module_device, backend=self.backend) + self.delta_modules.append(adapterlayer) + return adapterlayer diff --git a/OpenDelta-0.3.2/opendelta/delta_models/layers/__init__.py b/OpenDelta-0.3.2/opendelta/delta_models/layers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/OpenDelta-0.3.2/opendelta/delta_models/layers/activations.py b/OpenDelta-0.3.2/opendelta/delta_models/layers/activations.py new file mode 100644 index 0000000..8ce4a16 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/delta_models/layers/activations.py @@ -0,0 +1,50 @@ +import torch +import math +import torch.nn as nn + +import torch.nn as nn +from transformers.activations import get_activation + +class Activations(nn.Module): + """ + Implementation of various activation function. Copied from open-source project AdapterHub #TODO: addlink + """ + + def __init__(self, activation_type): + self.activation_type = activation_type + if activation_type.lower() == "relu": + self.f = nn.functional.relu + elif activation_type.lower() == "tanh": + self.f = torch.tanh + elif activation_type.lower() == "swish": + + def swish(x): + return x * torch.sigmoid(x) + + self.f = swish + elif activation_type.lower() == "gelu_new": + + def gelu_new(x): + """ + Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). 
+ Also see https://arxiv.org/abs/1606.08415 + """ + return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + + self.f = gelu_new + elif activation_type.lower() == "gelu_orig": + self.f = nn.functional.gelu + elif activation_type.lower() == "leakyrelu": + self.f = nn.functional.leaky_relu + else: + self.f = get_activation(activation_type) + + super().__init__() + + def forward(self, x): + return self.f(x) + + def __repr__(self): + return self.activation_type + + diff --git a/OpenDelta-0.3.2/opendelta/delta_models/layers/hypercomplex_linear.py b/OpenDelta-0.3.2/opendelta/delta_models/layers/hypercomplex_linear.py new file mode 100644 index 0000000..e4f94c2 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/delta_models/layers/hypercomplex_linear.py @@ -0,0 +1,214 @@ +# The codes are from https://github.com/bayer-science-for-a-better-life/phc-gnn +import torch +import torch.nn as nn +from typing import Union, Optional +import torch.nn.functional as F +import torch +import math +from opendelta.delta_models.layers.init import glorot_uniform, glorot_normal + + + +# The codes are from https://github.com/bayer-science-for-a-better-life/phc-gnn + +"""A part of the pylabyk library: numpytorch.py at https://github.com/yulkang/pylabyk""" +def kronecker_product(a, b): + """ + Kronecker product of matrices a and b with leading batch dimensions. + Batch dimensions are broadcast. The number of them mush + :type a: torch.Tensor + :type b: torch.Tensor + :rtype: torch.Tensor + """ + #return torch.stack([torch.kron(ai, bi) for ai, bi in zip(a,b)], dim=0) + siz1 = torch.Size(torch.tensor(a.shape[-2:]) * torch.tensor(b.shape[-2:])) + res = a.unsqueeze(-1).unsqueeze(-3) * b.unsqueeze(-2).unsqueeze(-4) + siz0 = res.shape[:-4] + out = res.reshape(siz0 + siz1) + return out + + +def kronecker_product_einsum_batched(A: torch.Tensor, B: torch.Tensor): + """ + Batched Version of Kronecker Products + :param A: has shape (b, a, c) + :param B: has shape (b, k, p) + :return: (b, ak, cp) + """ + assert A.dim() == 3 and B.dim() == 3 + res = torch.einsum('bac,bkp->bakcp', A, B).view(A.size(0), + A.size(1)*B.size(1), + A.size(2)*B.size(2)) + return res + + + +def matvec_product(W: torch.Tensor, x: torch.Tensor, + bias: Optional[torch.Tensor], + phm_rule, #: Union[torch.Tensor], + kronecker_prod=False) -> torch.Tensor: + """ + Functional method to compute the generalized matrix-vector product based on the paper + "Parameterization of Hypercomplex Multiplications (2020)" + https://openreview.net/forum?id=rcQdycl0zyk + y = Hx + b , where W is generated through the sum of kronecker products from the Parameterlist W, i.e. 
+ W is a an order-3 tensor of size (phm_dim, in_features, out_features) + x has shape (batch_size, phm_dim*in_features) + phm_rule is an order-3 tensor of shape (phm_dim, phm_dim, phm_dim) + H = sum_{i=0}^{d} mul_rule \otimes W[i], where \otimes is the kronecker product + """ + if kronecker_prod: + H = kronecker_product(phm_rule, W).sum(0) + else: + H = kronecker_product_einsum_batched(phm_rule, W).sum(0) + + y = torch.matmul(input=x.to(H.dtype), other=H).to(x.dtype) + if bias is not None: + y += bias + return y + + +class PHMLinear(torch.nn.Module): + def __init__(self, + in_features: int, + out_features: int, + phm_dim: int, + phm_rule: Union[None, torch.Tensor] = None, + bias: bool = True, + w_init: str = "phm", + c_init: str = "random", + learn_phm: bool = True, + shared_phm_rule=False, + factorized_phm=False, + shared_W_phm=False, + factorized_phm_rule=False, + phm_rank = 1, + phm_init_range=0.0001, + kronecker_prod=False, + dtype=torch.float) -> None: + super(PHMLinear, self).__init__() + assert w_init in ["phm", "glorot-normal", "glorot-uniform", "normal"] + assert c_init in ["normal", "uniform"] + assert in_features % phm_dim == 0, f"Argument `in_features`={in_features} is not divisble be `phm_dim`{phm_dim}" + assert out_features % phm_dim == 0, f"Argument `out_features`={out_features} is not divisble be `phm_dim`{phm_dim}" + self.in_features = in_features + self.out_features = out_features + self.learn_phm = learn_phm + self.phm_dim = phm_dim + self._in_feats_per_axis = in_features // phm_dim + self._out_feats_per_axis = out_features // phm_dim + self.phm_rank = phm_rank + self.phm_rule = phm_rule + self.phm_init_range = phm_init_range + self.kronecker_prod=kronecker_prod + self.shared_phm_rule = shared_phm_rule + self.factorized_phm_rule = factorized_phm_rule + if not self.shared_phm_rule: + if self.factorized_phm_rule: + self.phm_rule_left = nn.Parameter(torch.empty((phm_dim, phm_dim, 1), dtype=dtype), + requires_grad=learn_phm) + self.phm_rule_right = nn.Parameter(torch.empty((phm_dim, 1, phm_dim), dtype=dtype), + requires_grad=learn_phm) + else: + self.phm_rule = nn.Parameter(torch.empty((phm_dim, phm_dim, phm_dim), dtype=dtype), + requires_grad=learn_phm) + self.bias_flag = bias + self.w_init = w_init + self.c_init = c_init + self.shared_W_phm = shared_W_phm + self.factorized_phm = factorized_phm + if not self.shared_W_phm: + if self.factorized_phm: + self.W_left = nn.Parameter(torch.empty((phm_dim, self._in_feats_per_axis, self.phm_rank), dtype=dtype), + requires_grad=True) + self.W_right = nn.Parameter(torch.empty((phm_dim, self.phm_rank, self._out_feats_per_axis), dtype=dtype), + requires_grad=True) + else: + self.W = nn.Parameter(torch.empty((phm_dim, self._in_feats_per_axis, self._out_feats_per_axis), dtype=dtype), + requires_grad=True) + if self.bias_flag: + self.b = nn.Parameter(torch.empty(out_features, dtype=dtype), requires_grad=True) + else: + self.register_parameter("b", None) + self.reset_parameters() + + def init_W(self): + if self.w_init == "glorot-normal": + if self.factorized_phm: + for i in range(self.phm_dim): + self.W_left.data[i] = glorot_normal(self.W_left.data[i]) + self.W_right.data[i] = glorot_normal(self.W_right.data[i]) + else: + for i in range(self.phm_dim): + self.W.data[i] = glorot_normal(self.W.data[i]) + elif self.w_init == "glorot-uniform": + if self.factorized_phm: + for i in range(self.phm_dim): + self.W_left.data[i] = glorot_uniform(self.W_left.data[i]) + self.W_right.data[i] = glorot_uniform(self.W_right.data[i]) + else: + for i in 
range(self.phm_dim): + self.W.data[i] = glorot_uniform(self.W.data[i]) + elif self.w_init == "normal": + if self.factorized_phm: + for i in range(self.phm_dim): + self.W_left.data[i].normal_(mean=0, std=self.phm_init_range) + self.W_right.data[i].normal_(mean=0, std=self.phm_init_range) + else: + for i in range(self.phm_dim): + self.W.data[i].normal_(mean=0, std=self.phm_init_range) + else: + raise ValueError + + def reset_parameters(self): + if not self.shared_W_phm: + self.init_W() + + if self.bias_flag: + self.b.data = torch.zeros_like(self.b.data) + + if not self.shared_phm_rule: + if self.factorized_phm_rule: + if self.c_init == "uniform": + self.phm_rule_left.data.uniform_(-0.01, 0.01) + self.phm_rule_right.data.uniform_(-0.01, 0.01) + elif self.c_init == "normal": + self.phm_rule_left.data.normal_(std=0.01) + self.phm_rule_right.data.normal_(std=0.01) + else: + raise NotImplementedError + else: + if self.c_init == "uniform": + self.phm_rule.data.uniform_(-0.01, 0.01) + elif self.c_init == "normal": + self.phm_rule.data.normal_(mean=0, std=0.01) + else: + raise NotImplementedError + + def set_phm_rule(self, phm_rule=None, phm_rule_left=None, phm_rule_right=None): + """If factorized_phm_rules is set, phm_rule is a tuple, showing the left and right + phm rules, and if this is not set, this is showing the phm_rule.""" + if self.factorized_phm_rule: + self.phm_rule_left = phm_rule_left + self.phm_rule_right = phm_rule_right + else: + self.phm_rule = phm_rule + + def set_W(self, W=None, W_left=None, W_right=None): + if self.factorized_phm: + self.W_left = W_left + self.W_right = W_right + else: + self.W = W + + def forward(self, x: torch.Tensor, phm_rule: Union[None, nn.ParameterList] = None) -> torch.Tensor: + if self.factorized_phm: + W = torch.bmm(self.W_left, self.W_right) + if self.factorized_phm_rule: + phm_rule = torch.bmm(self.phm_rule_left, self.phm_rule_right) + return matvec_product( + W=W if self.factorized_phm else self.W, + x=x, + bias=self.b, + phm_rule=phm_rule if self.factorized_phm_rule else self.phm_rule, + kronecker_prod=self.kronecker_prod) \ No newline at end of file diff --git a/OpenDelta-0.3.2/opendelta/delta_models/layers/init.py b/OpenDelta-0.3.2/opendelta/delta_models/layers/init.py new file mode 100644 index 0000000..98a03e3 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/delta_models/layers/init.py @@ -0,0 +1,8 @@ +import torch +import math + +def glorot_normal(tensor: torch.Tensor): + return torch.nn.init.xavier_normal_(tensor, gain=math.sqrt(2)) + +def glorot_uniform(tensor: torch.Tensor): + return torch.nn.init.xavier_uniform_(tensor, gain=math.sqrt(2)) diff --git a/OpenDelta-0.3.2/opendelta/delta_models/layers/low_rank_linear.py b/OpenDelta-0.3.2/opendelta/delta_models/layers/low_rank_linear.py new file mode 100644 index 0000000..95c1466 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/delta_models/layers/low_rank_linear.py @@ -0,0 +1,39 @@ +"""This script implements a low-rank linear layer.""" +import torch +import torch.nn as nn + +from opendelta.delta_models.layers.init import glorot_uniform, glorot_normal + +class LowRankLinear(torch.nn.Module): + def __init__(self, input_dim: int, output_dim: int, rank: int = 1, + bias: bool = True, w_init: str = "glorot-uniform", dtype=torch.float): + super(LowRankLinear, self).__init__() + self.input_dim = input_dim + self.output_dim = output_dim + self.rank = rank + self.bias = bias + self.w_init = w_init + self.W_left = nn.Parameter(torch.empty((input_dim, rank), dtype=dtype),requires_grad=True) + self.W_right = 
nn.Parameter(torch.empty((rank, output_dim), dtype=dtype), requires_grad=True) + if bias: + self.b = nn.Parameter(torch.empty(output_dim, dtype=dtype)) + self.reset_parameters() + + def reset_parameters(self): + if self.bias: + self.b.data = torch.zeros_like(self.b.data) + if self.w_init == "glorot-uniform": + self.W_left.data = glorot_uniform(self.W_left.data) + self.W_right.data = glorot_uniform(self.W_right.data) + elif self.w_init == "glorot-normal": + self.W_left.data = glorot_normal(self.W_left.data) + self.W_right.data = glorot_normal(self.W_right.data) + else: + raise ValueError + + def forward(self, x: torch.Tensor) -> torch.Tensor: + W = self.W_left*self.W_right + output = torch.matmul(input=x.to(W.dtype), other=W).to(x.dtype) + if self.bias: + output += self.b + return output diff --git a/OpenDelta-0.3.2/opendelta/delta_models/lora.py b/OpenDelta-0.3.2/opendelta/delta_models/lora.py new file mode 100644 index 0000000..0446842 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/delta_models/lora.py @@ -0,0 +1,163 @@ +from typing import Optional, Union + +from opendelta.utils.signature import get_arg_names, get_arg_names_inside_func +from opendelta.utils.name_based_addressing import * +from opendelta.basemodel import DeltaBase +import torch.nn as nn +from opendelta import BaseDeltaConfig +import math +from dataclasses import dataclass, field + +class LowRankLinear(nn.Module): + # ------------------------------------------------------------------------------------------ + # Copyright (c) Microsoft Corporation. All rights reserved. + # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. + # ------------------------------------------------------------------------------------------ + # copy from loralib and do some refactor + def __init__(self, + in_features, + out_features, + weight, + r=8, + lora_alpha=16, + lora_dropout=0.0, + ): + super().__init__() + self.r = r + self.lora_alpha = lora_alpha + self.lora_dropout = lora_dropout + if lora_dropout > 0.: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + if r > 0: + self.lora_A = nn.Parameter(weight.new_zeros((r, in_features))) + self.lora_B = nn.Parameter(weight.new_zeros((out_features, r))) + self.scaling = self.lora_alpha / self.r + nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5)) + nn.init.zeros_(self.lora_B) + + def forward(self, x): + return (self.lora_dropout(x) @ self.lora_A.T @ self.lora_B.T) * self.scaling + +@dataclass +class LoraArguments: + r: int = 8 + lora_alpha: int = 16 + lora_dropout: float = 0.0 + +class LoraConfig(BaseDeltaConfig): + r""" + This is the configuration class to store the configuration of a :py:class:`~LoraModel` + + """ + def __init__( + self, + lora_r=8, + lora_alpha=16, + lora_dropout=0.0, + **kwargs + ): + super().__init__(**kwargs) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # the arg has not been registered in parent config + setattr(self, arg_name, locals()[arg_name]) + + +class LoraModel(DeltaBase): + r""" The implementation of `LoRA: Low-Rank Adaptation of Large Language Models `_ . + Thanks for their `loralib `_. + + .. note:: + + In our implementation, we did not use loralib.linear to replace the linear layer of the backbone model. + Instead, we insert a parallel module into the backbone. + In other words, we treat :math:`(W + A^TB) X` as :math:`WX+ A^TBX`, and insert the :math:`A^TBX` as a parallel insertion module. 
If you want to use the original implementation, please refer to `lora_old.py` + + class attributes: + + - default_modified_modules = ['attn.q', 'attn.v'] According to the paper, they modify q and v matrix in the attention layer. However, other linears can also be modified, and may lead to better performance. + + .. note:: + + modified_modules should point to linear layer. We currently don't support broadcast to all linears in + a module's child modules. + + - delta_type = "lora" + + + Args: + backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. + lora_r (:obj:`int`, *optional*): the rank of the lora parameters. The smaller lora_r is , the fewer parameters lora has. + lora_alpha (:obj:`int`, *optional*): A hyper-parameter to control the init scale of loralib.linear . + lora_dropout (:obj:`float`, *optional*): The dropout rate in lora.linear. + modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only + the implemented ones) + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen + together with the prefix parameters. + common_structure (:obj:`bool`): whether using name-based addressing with a common structure mapping. + backend (:obj:`str`): choose the backend of plm, 'hf' for huggingface transformers,'bmt' for bmtrain + + """ + + config_class = LoraConfig + delta_type = "lora" + default_modified_modules = ['attn@.q@', 'attn@.v@'] + _supported_backends = ['hf', 'bmt'] + _need_pseudo_data = False + def __init__(self, + backbone_model: nn.Module, + lora_r=8, + lora_alpha=16, + lora_dropout=0.0, + modified_modules: Optional[List[str]] = None, + unfrozen_modules: Optional[List[str]] = None, + exclude_modules: Optional[List[str]] = None, + common_structure: Optional[bool] = None, + interactive_modify: Optional[Union[bool, int]] = False, + backend: Optional[str] = "hf", + ): + DeltaBase.__init__(self, + backbone_model, + modified_modules=modified_modules, + unfrozen_modules=unfrozen_modules, + common_structure=common_structure, + interactive_modify=interactive_modify, + backend=backend, + ) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # not registered in parent class + setattr(self, arg_name, locals()[arg_name]) + + self.delta_modules = nn.ModuleList() + + self.add_all_delta_to_backbone(self.backbone_model, + self.modified_modules, + ) + + + def update_module(self, module: nn.Module, key: str): + parent_ref, child_name, child_ref = self.find_module(module, key) + parallel_module = self.new_module_like(child_module=child_ref) + self.insert_parallel_module(child_ref, delta_module=parallel_module, delta_name="lora") + + def _pseudo_data_to_instantiate(self, module): + # no need to pass pseudo input, so overwrite it + pass + + def new_module_like(self, child_module): + in_features, out_features = child_module.in_features, child_module.out_features + new_module = LowRankLinear(in_features = in_features, + out_features = out_features, + weight = child_module.weight, + r=self.lora_r, + lora_alpha=self.lora_alpha, + lora_dropout=self.lora_dropout) + if self.backend == "bmt": + import bmtrain as bmt + new_module = bmt.BMTrainModelWrapper(new_module) + + self.delta_modules.append(new_module) + return new_module diff --git a/OpenDelta-0.3.2/opendelta/delta_models/low_rank_adapter.py b/OpenDelta-0.3.2/opendelta/delta_models/low_rank_adapter.py new file mode 100644 index 0000000..eaef90a --- 
/dev/null +++ b/OpenDelta-0.3.2/opendelta/delta_models/low_rank_adapter.py @@ -0,0 +1,220 @@ + +from opendelta.basemodel import DeltaBase +from opendelta.delta_configs import BaseDeltaConfig +from opendelta.delta_models.layers.low_rank_linear import LowRankLinear +from opendelta.delta_models.layers.activations import Activations +from typing import Optional, Union +from opendelta.utils.signature import get_arg_names_inside_func +import torch.nn as nn +import torch +from typing import Optional +from opendelta.utils.name_based_addressing import * +from opendelta.utils.cuda import get_device +from opendelta.basemodel import DeltaBase +import torch.nn as nn +import torch +import math +import opendelta.utils.logging as logging +logger = logging.get_logger(__name__) + + +class LowRankAdapterConfig(BaseDeltaConfig): + r""" + This is the configuration class to store the configuration of a :py:class:`~LowRankAdapterModel` + + """ + def __init__( + self, + reduction_factor=32, + non_linearity="gelu_new", + low_rank_w_init="glorot-uniform", + low_rank_rank=1, + **kwargs + ): + super().__init__(**kwargs) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # the arg has not been registered in parent config + setattr(self, arg_name, locals()[arg_name]) + + + +class LowRankAdapter(nn.Module): + """This is the low-rank adapter, in which each adapter is composed of two rank-one matrices. + """ + def __init__(self, + reduction_factor=32, + non_linearity="gelu_new", + low_rank_w_init="glorot-uniform", + low_rank_rank=1, + device=None, + backend='hf'): + super().__init__() + self.reduction_factor = reduction_factor + self.non_linearity = non_linearity + self.low_rank_w_init = low_rank_w_init + self.low_rank_rank = low_rank_rank + self.device = device + self.instantiated = False + self.backend=backend + + + def instantiate(self, hiddens): + self.hidden_dim = hiddens.shape[-1] + self.hidden_dtype = hiddens.dtype + + self.down_sample_size = self.hidden_dim // self.reduction_factor + self.activation = Activations(self.non_linearity.lower()).to(self.device) + self.down_sampler = LowRankLinear(self.hidden_dim, self.down_sample_size, + w_init=self.low_rank_w_init, + rank=self.low_rank_rank, + dtype=self.hidden_dtype).to(self.device) + self.up_sampler = LowRankLinear(self.down_sample_size, self.hidden_dim, + w_init=self.low_rank_w_init, + rank=self.low_rank_rank, + dtype=self.hidden_dtype).to(self.device) + + self.instantiated = True + if self.backend == 'bmt': + import bmtrain as bmt + self.activation = bmt.BMTrainModelWrapper(self.activation) + self.down_sampler = bmt.BMTrainModelWrapper(self.down_sampler) + self.up_sampler = bmt.BMTrainModelWrapper(self.up_sampler) + + + def post_forward(self, output): + r""" Get the hidden_states from the PLM's layer output, pass it into the low-rank adapter, + then combined with the main hidden_states. Finally pass it into the subsequent layer. 
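+        In short, using the attribute names defined in ``instantiate``:
+        ``output = up_sampler(activation(down_sampler(hiddens))) + hiddens``.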
+ + """ + + if isinstance(output, tuple): + hiddens = output[0] + elif isinstance(output, torch.Tensor): + hiddens = output + else: + raise TypeError + + if not self.instantiated: + self.instantiate(hiddens = hiddens) + + z = self.down_sampler(hiddens) + z = self.activation(z) + adapter_output = self.up_sampler(z) + + modified_output = adapter_output + hiddens # residual_connection + if isinstance(output, tuple): + output = (modified_output,) + output[1:] + elif isinstance(output, torch.Tensor): + output = modified_output + else: + raise TypeError + return output + + + + + + +class LowRankAdapterModel(DeltaBase): + r""" The implementation of LowRankAdapter, proposed as a baseline in + `Compacter: Efficient Low-Rank Hypercomplex Adapter Layers `_ . + We found that it enjoys very few parameters but competitive performance, thus add it into OpenDelta. + Low Rank Adapter parameterize each adapter’s weight as a product of two rank-one(low) weights. + + Add lowrank adapter layer to the designated ``modified_modules``. In sequential paradigm, The modules' output is then + passed into the low rank adapter's post_forward. + + .. note:: + We **assume** the output of the modified module is the hidden state or a tuple where hidden state is the + first element. This is true for most PLMs. However, we admit that currently it's not rigorous, We will improve + it in the next version. Currently, if you encount an error here for you backbone, you can modify the code to + get the hidden state. + + All the hyperparameter is adopted from the `compacter code base `_ . + + class attributes: + - default_modified_modules = ["attn", "ff"] According to the compacter paper, we add low rank adapter to the attention layer + and feed forward layer. + - delta_type = "lowrankadapter" + + Args: + backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. + reduction_factor (:obj:`int`, *optional*, default to ``16``): bottleneck_dim = hidden_dim//reduction_factor + non_linearity (:obj:`str`, *optional*, default to ``"gelu_new"``): The non linearity activation used in between the down + projecter and the up projecter. + low_rank_w_init (:obj:`str`, *optional*, default to ``"glorot-uniform"``): The weight init method of the factorized + linear weight. + low_rank_rank (:obj:`int`, *optional*, default to 1): The rank of the low-rank decomposition. + modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only + the implemented ones) + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen + together with the prefix parameters. + common_structure (:obj:`bool`, *optional*, default to :obj:`None`): whether using name-based addressing with a common structure mapping. 
+ + """ + + config_class = LowRankAdapterConfig + delta_type = "low_rank_adapter" + default_modified_modules = ["attn@.proj@", "ff@.w2@"] + _supported_backends = ['hf', 'bmt'] + _need_pseudo_data = True + def __init__(self, + backbone_model: nn.Module, + reduction_factor = 32, + non_linearity = "gelu_new", + low_rank_w_init = "glorot-uniform", + low_rank_rank = 1, + modified_modules: Optional[List[str]] = None, + exclude_modules: Optional[List[str]] = None, + unfrozen_modules: Optional[List[str]] = None, + common_structure: Optional[bool] = None, + interactive_modify: Optional[Union[bool, int]] = False, + backend: Optional[str] = 'hf', + ): + DeltaBase.__init__(self, + backbone_model, + modified_modules=modified_modules, + exclude_modules=exclude_modules, + unfrozen_modules=unfrozen_modules, + common_structure=common_structure, + interactive_modify=interactive_modify, + backend=backend, + ) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # not registered in parent class + setattr(self, arg_name, locals()[arg_name]) + + self.delta_modules = nn.ModuleList() + + self.add_all_delta_to_backbone(self.backbone_model, + self.modified_modules, + ) + + + # def add_all_delta_to_backbone(self, + # module: nn.Module, + # modified_modules: List[str], + # ) -> nn.Module: + # for key, _ in module.named_modules(): + # if self.find_key(key, modified_modules): + # self.update_module(module, key) + # self._pseudo_data_to_instantiate(module) + # self.mark_as_delta() + # return module + + def update_module(self, module: nn.Module, key: str): + _, _, ref = self.find_module(module, key) + adapterlayer = self.new_module_like(ref) + self.insert_sequential_module(ref, delta_module=adapterlayer, delta_name="low_rank_adapter") + + def new_module_like(self, module): + module_device = get_device(module) + adapterlayer = LowRankAdapter(reduction_factor = self.reduction_factor, + non_linearity = self.non_linearity, + low_rank_w_init = self.low_rank_w_init, + low_rank_rank = self.low_rank_rank, + device=module_device, backend=self.backend) + self.delta_modules.append(adapterlayer) + return adapterlayer diff --git a/OpenDelta-0.3.2/opendelta/delta_models/parallel_adapter.py b/OpenDelta-0.3.2/opendelta/delta_models/parallel_adapter.py new file mode 100644 index 0000000..d354587 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/delta_models/parallel_adapter.py @@ -0,0 +1,208 @@ +from functools import partial +from random import random +from typing import Optional, Union +from opendelta.utils.signature import get_arg_names_inside_func +from opendelta.utils.name_based_addressing import * +from opendelta.utils.cuda import get_device +from opendelta.basemodel import DeltaBase +import torch.nn as nn +import torch +from opendelta.delta_models.layers.activations import Activations +from opendelta import BaseDeltaConfig +import opendelta.utils.logging as logging +logger = logging.get_logger(__name__) + +class ParallelAdapterLayer(nn.Module): + r"""A layer of adapter tuning module. 
+ """ + layer_count = 0 + + @classmethod + def count_layer(cls): + cls.layer_count += 1 + + @classmethod + def get_layer_count(cls): + return cls.layer_count + + def __init__(self, bottleneck_dim=24, non_linearity='gelu_new', scaled=1, device=None, backend='hf'): + super().__init__() + self.bottleneck_dim = bottleneck_dim + self.device = device + self.instantiated = False + self.non_linearity = non_linearity + self.scaled = scaled + self.backend = backend + + self.layer_id = ParallelAdapterLayer.get_layer_count() + ParallelAdapterLayer.count_layer() + + + def instantiate(self, hiddens): + self.hidden_dim = hiddens.shape[-1] + self.hidden_dtype = hiddens.dtype + self.modulelist = nn.Sequential() + self.modulelist.add_module("down_proj",nn.Linear(self.hidden_dim, self.bottleneck_dim, device=self.device, dtype=self.hidden_dtype)) + + # select non-linearity + self.modulelist.add_module("non_linear", Activations(self.non_linearity.lower())) + + self.modulelist.add_module("up_proj", nn.Linear(self.bottleneck_dim, self.hidden_dim, device=self.device, dtype=self.hidden_dtype)) + + self.instantiated = True + # initialize the weight, which is important for fast convergence and better performance. + self.apply(self._init_weight) + if self.backend == 'bmt': + import bmtrain as bmt + self.modulelist = bmt.BMTrainModelWrapper(self.modulelist) + + def _init_weight(self, module): + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=0.01) + if module.bias is not None: + module.bias.data.zero_() + + + def pre_forward(self, *args, **kwargs): + r""" Get the hidden_states from the PLM's layer output, pass it into the adapter, + then combined with the main hidden_states. Finally pass it into the subsequent layer. + + """ + if isinstance(args, tuple): + hiddens = args[0] + elif isinstance(args, torch.Tensor): + hiddens = args + else: + raise TypeError + + + if not self.instantiated: + # logger.debug(f"Got hidden dim hidden_dim {self.hidden_dim}") + self.instantiate(hiddens = hiddens) + + + self.adapter_output = self.modulelist(hiddens) * self.scaled + return args, kwargs + + def post_forward(self, output, **kwargs): + if isinstance(output, tuple): + hidden = output[0] + elif isinstance(output, torch.Tensor): + hidden = output + else: + raise TypeError + + modified_output = self.adapter_output + hidden + if isinstance(output, tuple): + output = (modified_output,) + output[1:] + elif isinstance(output, torch.Tensor): + output = modified_output + else: + raise TypeError + return output + + + +class ParallelAdapterConfig(BaseDeltaConfig): + r""" + This is the configuration class to store the configuration of a :py:class:`~ParallelAdapterModel` + + """ + def __init__( + self, + bottleneck_dim: Optional[int]=24, + non_linearity: Optional[str]='gelu_new', + scaled: Optional[float]=1., + **kwargs + ): + super().__init__(**kwargs) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # the arg has not been registered in parent config + setattr(self, arg_name, locals()[arg_name]) + + + +class ParallelAdapterModel(DeltaBase): + r""" The implementation of Parallel Adapter(`TOWARDS A UNIFIED VIEW OF PARAMETER-EFFICIENT TRANSFER LEARNING `_ ) . + Add adapter to the designated ``modified_modules``. In parallel paradigm, The modules' output is then passed into the adapter's + post_forward. + + .. note:: + We **assume** the output of the modified module is the hidden state or a tuple where hidden state is the + first element. 
This is true for most PLMs. However, we admit that currently it's not rigorous, We will improve + it in the next version. Currently, if you encount an error here for you backbone, you can modify the code to + get the hidden state. + + class attributes: + - default_modified_modules = ["attn", "ff"] According to the Adapter paper, we add adapter to the attention layer + and feed forward layer. + - delta_type = "adapter" + + Args: + backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. + bottleneck_dim (:obj:`int`): The dimension of the adapter's bottleneck. + non_linearity (:obj:`str`): The non linearity of the adapter. + modified_modules (:obj:`List[str]`): modules to add parallel adapter. Must be paired and have the save order in layer. For examples, ["attn", "attn", "ff.w1", "ff.w2"] add one parallel adapter from attn's input to attn's output, and another one from ff.w1's input to ff.w2's output. + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen together with the parallel adapter parameters. + common_structure (:obj:`bool`): whether using name-based addressing witha common structure mapping. + backend (:obj:`str`): choose the backend of plm, 'hf' for huggingface transformers,'bmt' for bmtrain + + """ + config_class = ParallelAdapterConfig + delta_type = "parallel_adapter" + default_modified_modules = ["attn@", "attn@", "ff@.w1@", "ff@.w2@"] + # default_modified_modules = ["attn", "attn", "ff.w1", "ff.w2"] + _supported_backends = ['hf', 'bmt'] + _need_pseudo_data = True + def __init__(self, + backbone_model: nn.Module, + bottleneck_dim: Optional[int]=24, + non_linearity: Optional[str]='gelu_new', + modified_modules: Optional[bool] = None, + exclude_modules: Optional[List[str]] = None, + unfrozen_modules: Optional[bool] = None, + common_structure: Optional[bool] = None, + interactive_modify: Optional[Union[bool, int]] = False, + backend: Optional[str] = "hf", + ): + DeltaBase.__init__(self, + backbone_model, + modified_modules=modified_modules, + exclude_modules=exclude_modules, + unfrozen_modules=unfrozen_modules, + common_structure=common_structure, + interactive_modify=interactive_modify, + backend=backend, + ) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # not registered in parent class + setattr(self, arg_name, locals()[arg_name]) + + self.delta_modules = nn.ModuleList() + + self.ith = 0 + self.add_all_delta_to_backbone(self.backbone_model, + self.modified_modules, + ) + + + def update_module(self, module: nn.Module, key: str): + _, _, ref = self.find_module(module, key) + if self.ith % 2 == 0: + adapterlayer = self.new_module_like(ref) + self.insert_module(ref, "before", delta_module=adapterlayer, delta_name="parallel_adapter") + if self.ith % 2 == 1 or self.modified_modules[self.ith] == self.modified_modules[self.ith + 1]: + adapterlayer = self.delta_modules[-1] + self.insert_module(ref, "after", delta_module=adapterlayer, delta_name="parallel_adapter") + self.ith |= 1 + self.ith += 1 + self.ith %= len(self.modified_modules) + + def new_module_like(self, module): + module_device = get_device(module) + adapterlayer = ParallelAdapterLayer(bottleneck_dim=self.bottleneck_dim, non_linearity=self.non_linearity, device=module_device, backend=self.backend) + self.delta_modules.append(adapterlayer) + return adapterlayer + \ No newline at end of file diff --git a/OpenDelta-0.3.2/opendelta/delta_models/prefix.py 
b/OpenDelta-0.3.2/opendelta/delta_models/prefix.py new file mode 100644 index 0000000..777a8e1 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/delta_models/prefix.py @@ -0,0 +1,628 @@ +from functools import partial +from opendelta.delta_configs import BaseDeltaConfig +from opendelta.utils.signature import get_arg_names_inside_func, signature +from typing import Optional, Union +from transformers.models.distilbert.modeling_distilbert import MultiHeadSelfAttention +from transformers.models.t5.modeling_t5 import T5Attention, T5LayerSelfAttention +from transformers.models.bert.modeling_bert import BertAttention +from transformers.models.gpt2.modeling_gpt2 import GPT2Attention +from transformers.models.bart.modeling_bart import BartAttention +from transformers.models.roberta.modeling_roberta import RobertaAttention +from opendelta.utils.name_based_addressing import * +from opendelta.utils.cuda import get_device +from opendelta.basemodel import DeltaBase +from transformers.models.t5 import T5ForConditionalGeneration +import torch.nn as nn +import torch +import opendelta.utils.logging as logging +logger = logging.get_logger(__name__) + + +class PrefixLayerT5(nn.Module): + r"""A layer of prefix tuning module. The layer's forward function pass (or concatenate) the additional past_key_value + into the original attention layer's forward function. + """ + def __init__(self, prefix_token_num, num_heads, device,): + super().__init__() + self.prefix_token_num = prefix_token_num + self.num_heads = num_heads + self.device = device + self.instantiated = False + + def instantiate(self, hidden_dim): + self.past_key = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_value = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_key_reparam = None + self.past_value_reparam = None + self.instantiated = True + + + def pre_forward(self, *args, **kwargs): + r"""The args and kwargs are inherited from the T5Attention's forward function. + """ + batch_size = args[0].shape[0] + seq_len = args[0].shape[-2] + if not self.instantiated: + self.hidden_dim = args[0].shape[-1] + self.instantiate(hidden_dim=self.hidden_dim) + if self.past_key_reparam is None: + past_key = self.past_key + else: + past_key = self.past_key_reparam + if self.past_value_reparam is None: + past_value = self.past_value + else: + past_value = self.past_value_reparam + + + def expand_batchsize(x): + x = x.reshape(self.prefix_token_num, self.num_heads, -1).transpose(0,1) + x = x.unsqueeze(0).expand(batch_size, *x.shape) + return x + + if 'position_bias' in kwargs and kwargs['position_bias'] is not None: + if kwargs['position_bias'].shape[-1] != seq_len + self.prefix_token_num: # Then the position_bias should be re-calculated + kwargs['position_bias'] = None + if kwargs['past_key_value'] is None: + kwargs['past_key_value'] = (expand_batchsize(past_key), expand_batchsize(past_value)) + + past_key_len = kwargs['past_key_value'][0].shape[-2] + + if 'mask' in kwargs and kwargs['mask'] is not None: + mask_len = kwargs['mask'].shape[-1] + if past_key_len + seq_len == mask_len + self.prefix_token_num: + + am = kwargs['mask'] # Should check the format of the attention_mask when moving to a new plm. 
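+                # Clarifying comment (added; not in the original source): T5 uses an additive
+                # attention mask (0 for visible positions, a large negative value for masked
+                # ones), so concatenating zeros in front makes the prefix positions visible to
+                # every query. A mask of shape (batch, 1, 1, seq_len) thus becomes
+                # (batch, 1, 1, prefix_token_num + seq_len) after the line below.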
+ kwargs['mask'] = torch.cat([-torch.zeros((*am.shape[:-1],self.prefix_token_num), dtype = am.dtype,device=am.device), am], dim=-1) + return args, kwargs + + def post_forward(self, output): + r""" Remove the cached positional bias, since the next layer may not have prefix token. + """ + output = output[:2] + (None, )+ output[3:] + return output + +class PrefixLayerBart(nn.Module): + r"""A layer of prefix tuning module. The layer's forward function pass (or concatenate) the additional past_key_value + into the original attention layer's forward function. + """ + def __init__(self, prefix_token_num, num_heads, device,): + super().__init__() + self.prefix_token_num = prefix_token_num + self.num_heads = num_heads + self.device = device + self.instantiated = False + + def instantiate(self, hidden_dim): + self.past_key = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_value = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_key_reparam = None + self.past_value_reparam = None + self.instantiated = True + + + def pre_forward(self, *args, **kwargs): + r"""The args and kwargs are inherited from the T5Attention's forward function. + """ + + batch_size = kwargs['hidden_states'].shape[0] + if not self.instantiated: + self.hidden_dim = kwargs['hidden_states'].shape[-1] + self.instantiate(hidden_dim=self.hidden_dim) + if self.past_key_reparam is None: + past_key = self.past_key + else: + past_key = self.past_key_reparam + if self.past_value_reparam is None: + past_value = self.past_value + else: + past_value = self.past_value_reparam + + # from IPython import embed + # embed() + def expand_batchsize(x): + x = x.reshape(self.prefix_token_num, self.num_heads, -1).transpose(0,1) + x = x.unsqueeze(0).expand(batch_size, *x.shape) + return x + # from IPython import embe + + if 'past_key_value' not in kwargs or kwargs['past_key_value'] is None: + kwargs['past_key_value'] = (expand_batchsize(past_key), expand_batchsize(past_value)) + + if 'attention_mask' in kwargs and kwargs['attention_mask'] is not None: + am = kwargs['attention_mask'] # Should check the format of the attention_mask when moving to a new plm. + kwargs['attention_mask'] = torch.cat([-torch.zeros((*am.shape[:-1],self.prefix_token_num), dtype = am.dtype,device=am.device), am], dim=-1) + return args, kwargs + + +class PrefixLayerGPT2(nn.Module): + r"""A layer of prefix tuning module. The layer's forward function pass (or concatenate) the additional past_key_value + into the original attention layer's forward function. + """ + def __init__(self, prefix_token_num, num_heads, device,): + super().__init__() + self.prefix_token_num = prefix_token_num + self.num_heads = num_heads + self.device = device + self.instantiated = False + + def instantiate(self, hidden_dim): + self.past_key = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_value = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_key_reparam = None + self.past_value_reparam = None + self.instantiated = True + + + def pre_forward(self, *args, **kwargs): + r"""The args and kwargs are inherited from the T5Attention's forward function. 
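+        (For this layer the wrapped module is ``GPT2Attention``, which exposes the cached
+        key/value pair as ``layer_past`` rather than ``past_key_value``.)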
+ """ + batch_size = args[0].shape[0] + if not self.instantiated: + self.hidden_dim = args[0].shape[-1] + self.instantiate(hidden_dim=self.hidden_dim) + if self.past_key_reparam is None: + past_key = self.past_key + else: + past_key = self.past_key_reparam + if self.past_value_reparam is None: + past_value = self.past_value + else: + past_value = self.past_value_reparam + + def expand_batchsize(x): + x = x.reshape(self.prefix_token_num, self.num_heads, -1).transpose(0,1) + x = x.unsqueeze(0).expand(batch_size, *x.shape) + return x + + + if kwargs['layer_past'] is None: + kwargs['layer_past'] = (expand_batchsize(past_key), expand_batchsize(past_value)) + if 'attention_mask' in kwargs and kwargs['attention_mask'] is not None: + am = kwargs['attention_mask'] # Should check the format of the attention_mask when moving to a new plm. + kwargs['attention_mask'] = torch.cat([-torch.zeros((*am.shape[:-1],self.prefix_token_num), dtype = am.dtype,device=am.device), am], dim=-1) + return args, kwargs + + + +class PrefixLayerDistilBert(nn.Module): + # TODO: Warning: have bugs + def __init__(self, prefix_token_num, device,): + super().__init__() + self.prefix_token_num = prefix_token_num + self.device = device + self.key_instantiated = False + self.value_instantiated = False + + def forward(self, *args, **kwargs): + mask = kwargs["mask"] + key, value = kwargs['key'], kwargs['value'] + prefix_mask = torch.ones(mask.shape[0], self.prefix_token_num, dtype=mask.dtype, device=mask.device) + concated_mask = torch.cat([prefix_mask, mask], dim=1) + pseudo_prefix = torch.zeros(key.shape[0], self.prefix_token_num, key.shape[2], dtype=key.dtype, device=key.device) + concated_key = torch.cat([pseudo_prefix, key], dim=1) + concated_value = torch.cat([pseudo_prefix, value], dim=1) + kwargs["mask"] = concated_mask + kwargs['key'] = concated_key + kwargs['value'] = concated_value + return args, kwargs + + + def key_instantiate(self, hidden_dim): + self.past_key = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_key_reparam = None + self.key_instantiated = True + + def value_instantiate(self, hidden_dim): + self.past_value = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_value_reparam = None + self.value_instantiated = True + + def key_pre_forward(self, *args, **kwargs): + _input = args[0] + _input = _input[:,self.prefix_token_num:, :] + args = (_input,) +args[1:] + return args, kwargs + + def value_pre_forward(self, *args, **kwargs): + _input = args[0] + _input = _input[:,self.prefix_token_num:, :] + args = (_input,) +args[1:] + return args, kwargs + + def key_forward(self, output: torch.Tensor): ### Check whether run prefix is ok, 12.21 + if isinstance(output, torch.Tensor): + hiddens = output + else: + raise TypeError + if not self.key_instantiated: + self.hidden_dim = hiddens.shape[-1] + logger.debug(f"Got key hidden dim hidden_dim {self.hidden_dim}") + self.key_instantiate(hidden_dim=self.hidden_dim) + batch_size = hiddens.shape[0] + if self.past_key_reparam is None: + past_key = self.past_key + else: + past_key = self.past_key_reparam + output = torch.cat([past_key.unsqueeze(0).expand(batch_size, *past_key.shape), hiddens], dim=1) + return output + + def value_forward(self, output: torch.Tensor): ### Check whether run prefix is ok, 12.21 + if isinstance(output, torch.Tensor): + hiddens = output + else: + raise TypeError + if not self.value_instantiated: + self.hidden_dim = 
hiddens.shape[-1] + logger.debug(f"Got value hidden dim hidden_dim {self.hidden_dim}") + self.value_instantiate(hidden_dim=self.hidden_dim) + batch_size = hiddens.shape[0] + if self.past_value_reparam is None: + past_value = self.past_value + else: + past_value = self.past_value_reparam + output = torch.cat([past_value.unsqueeze(0).expand(batch_size, *past_value.shape), hiddens], dim=1) + return output + + +class PrefixLayerBert(nn.Module): + r"""A layer of prefix tuning module. The layer's forward function pass (or concatenate) the additional past_key_value + into the original attention layer's forward function. + """ + def __init__(self, prefix_token_num, num_heads, device,): + super().__init__() + self.prefix_token_num = prefix_token_num + self.num_heads = num_heads + self.device = device + self.instantiated = False + + def instantiate(self, hidden_dim): + self.past_key = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_value = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_key_reparam = None + self.past_value_reparam = None + self.instantiated = True + + + def pre_forward(self, *args, **kwargs): + r"""The args and kwargs are inherited from the T5Attention's forward function. + """ + batch_size = args[0].shape[0] + if not self.instantiated: + self.hidden_dim = args[0].shape[-1] + self.instantiate(hidden_dim=self.hidden_dim) + if self.past_key_reparam is None: + past_key = self.past_key + else: + past_key = self.past_key_reparam + if self.past_value_reparam is None: + past_value = self.past_value + else: + past_value = self.past_value_reparam + + + def expand_batchsize(x): + x = x.reshape(self.prefix_token_num, self.num_heads, -1).transpose(0,1) + x = x.unsqueeze(0).expand(batch_size, *x.shape) + return x + # from IPython import embe + + if 'past_key_value' not in kwargs or kwargs['past_key_value'] is None: + kwargs['past_key_value'] = (expand_batchsize(past_key), expand_batchsize(past_value)) + + if 'attention_mask' in kwargs and kwargs['attention_mask'] is not None: + am = kwargs['attention_mask'] # Should check the format of the attention_mask when moving to a new plm. + kwargs['attention_mask'] = torch.cat([-torch.zeros((*am.shape[:-1],self.prefix_token_num), dtype = am.dtype,device=am.device), am], dim=-1) + elif len(args) >1: # attention mask is passed via positional argument + am = args[1] + am = torch.cat([-torch.zeros((*am.shape[:-1],self.prefix_token_num), dtype = am.dtype,device=am.device), am], dim=-1) + args = (args[0], am) + args[2:] + # from IPython import embed + # embed(header = "Herein prefixroberta") + return args, kwargs + + + +class PrefixLayerRoberta(nn.Module): + r"""A layer of prefix tuning module. The layer's forward function pass (or concatenate) the additional past_key_value + into the original attention layer's forward function. 
+ """ + def __init__(self, prefix_token_num, num_heads, device,): + super().__init__() + self.prefix_token_num = prefix_token_num + self.num_heads = num_heads + self.device = device + self.instantiated = False + + def instantiate(self, hidden_dim): + self.past_key = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_value = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_key_reparam = None + self.past_value_reparam = None + self.instantiated = True + + + def pre_forward(self, *args, **kwargs): + r"""The args and kwargs are inherited from the T5Attention's forward function. + """ + batch_size = args[0].shape[0] + if not self.instantiated: + self.hidden_dim = args[0].shape[-1] + self.instantiate(hidden_dim=self.hidden_dim) + if self.past_key_reparam is None: + past_key = self.past_key + else: + past_key = self.past_key_reparam + if self.past_value_reparam is None: + past_value = self.past_value + else: + past_value = self.past_value_reparam + + # from IPython import embed + # embed() + def expand_batchsize(x): + x = x.reshape(self.prefix_token_num, self.num_heads, -1).transpose(0,1) + x = x.unsqueeze(0).expand(batch_size, *x.shape) + return x + # from IPython import embe + + if 'past_key_value' not in kwargs or kwargs['past_key_value'] is None: + kwargs['past_key_value'] = (expand_batchsize(past_key), expand_batchsize(past_value)) + + if 'attention_mask' in kwargs and kwargs['attention_mask'] is not None: + am = kwargs['attention_mask'] # Should check the format of the attention_mask when moving to a new plm. + kwargs['attention_mask'] = torch.cat([-torch.zeros((*am.shape[:-1],self.prefix_token_num), dtype = am.dtype,device=am.device), am], dim=-1) + elif len(args) >1: # attention mask is passed via positional argument + am = args[1] + am = torch.cat([-torch.zeros((*am.shape[:-1],self.prefix_token_num), dtype = am.dtype,device=am.device), am], dim=-1) + args = (args[0], am) + args[2:] + # from IPython import embed + # embed(header = "Herein prefixroberta") + return args, kwargs + + + + # def post_forward(self, output): + # r""" Remove the cached positional bias, since the next layer may not have prefix token. + # """ + # output = output[:2] + (None, )+ output[3:] + # return output + + +class ReparameterizeFunction(nn.Module): + r""" Prefix Tuning's performance is better with a reparameterize module, which generates + the ``past_key_value`` using an MLP instead of directly optimizing the ``past_key_value`` as leaf variable. + In our implementation, the reparameterize module is constructed according to the number of parameters + in all ``past_key_value``s. Thus, variable number of prefixlayer is supported (not restricting to being equal + to the number of layers of the pretraind language model) + + + """ + def __init__(self, prefix_token_num, embed_dim, dropout_rate=0.0, mid_dim=512, module_list=[]): + super().__init__() + self.prefix_token_num = prefix_token_num + self.embed_dim = embed_dim + self.mid_dim = mid_dim + self.module_list = module_list + self.dropout = nn.Dropout(dropout_rate) + self.record_parameters() + self.compatibility_check() + self.define_reparameterization_network() + + def record_parameters(self): + r""" Enumerate the parameters that need to be reparameterized. + Then, delete the original parameters. 
+ """ + tot = 0 + for module in self.module_list: + for n, parameters in module.named_parameters(): + tot += parameters.numel() + module.register_parameter(n, None) + self.total_parameters_num = tot + + def compatibility_check(self,): + r"""May be removed. + """ + assert self.total_parameters_num % self.prefix_token_num == 0 + + def allocate_parameter(self): + r""" At the beginning of each forward pass through the whole network(PLM), + cacalulate the reparameterized past_key and past_value (``past_key_reparam`` and ``past_value_reparam``) + for later use in each layer. + """ + input_tokens = self.input_tokens + temp_control = self.wte(input_tokens) + past_key_values = self.control_trans(temp_control) + seqlen, _ = past_key_values.shape + + past_key_values = past_key_values.view(seqlen, len(self.module_list) * 2, self.module_list[0].hidden_dim) + past_key_values = self.dropout(past_key_values) + past_key_values = past_key_values.permute([1, 0, 2]).split(2) + + for module_id, module in enumerate(self.module_list): + module.past_key_reparam = past_key_values[module_id][0] + module.past_value_reparam = past_key_values[module_id][1] + + def pre_forward(self, *args, **kwargs): + r""" Firstly forward through the reparameterized network, and then go through normal forward pass of the PLM. + """ + self.allocate_parameter() + return args, kwargs + + def define_reparameterization_network(self) -> None: + r""" Build the reparameterize module + """ + self.input_tokens = nn.Parameter(torch.arange(self.prefix_token_num).long(), requires_grad=False) # to allow automatic devicing + self.wte = nn.Embedding(self.prefix_token_num, self.embed_dim) + self.control_trans = nn.Sequential( + nn.Linear(self.embed_dim, self.mid_dim), + nn.Tanh(), + nn.Linear(self.mid_dim, self.total_parameters_num//self.prefix_token_num) + ) + + +class PrefixConfig(BaseDeltaConfig): + def __init__( + self, + prefix_token_num=6, + reparameterize=True, + embed_dim: Optional[int]=512, + mid_dim: Optional[int]=512, + **kwargs + ): + super().__init__(**kwargs) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # the arg has not been registered in parent config + setattr(self, arg_name, locals()[arg_name]) + + + + + +class PrefixModel(DeltaBase): + r""" The implementation of `Prefix-Tuning: Optimizing Continuous Prompts for Generation `_ . + However, as attention block of different PLM differs substantially, e.g., the input arguments, the name convention + of ``past_key_value``, we have to implement different prefixlayer for different PLM. Given the inconvenience in the + code level, we only support several commonly used backbone models (Currently: T5, DistilBert,Bert, Roberta, GPT2, + BART). If you are trying to apply delta tuning to other backbone models, we suggest you trying other delta models + or implementing it and making a pull request. + + Experimental Feature: + + Support inserting prefix token before each layer. For example, layer 3 4 6 10 and other layer untouched. + + .. note:: + + If using reparameterize, the parameters will be in a reparameterization network, not in the prefix, which + we attach to the first prefix layer. We will add a function to save only the generated prefix parameters for + saving in the next version. + + + + Args: + backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. 
+ prefix_token_num (:obj:`int`): the number of prefix token + reparameterize (:obj:`bool`): Whether use the reparameterization for prefix tuning. + embed_dim (:obj:`int`): The embeding dimension of prefix token when using the reparameterization. + mid_dim (:obj:`int`): The dimension of the hiddens of the reparameterization network. + modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only + the implemented ones) + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen + together with the prefix parameters. + common_structure (:obj:`bool`): whether using name-based addressing with a common structure mapping. + + """ + config_class = PrefixConfig + delta_type = "prefix" + default_modified_modules = ['attn@'] + _supported_backends = ['hf'] + _need_pseudo_data = True + def __init__(self, + backbone_model: nn.Module, + prefix_token_num=6, + reparameterize=True, + embed_dim: Optional[int]=512, + mid_dim: Optional[int]=512, + modified_modules: Optional[List[str]] = None, + exclude_modules: Optional[List[str]] = None, + unfrozen_modules: Optional[List[str]] = None, + common_structure: Optional[bool] = None, + interactive_modify: Optional[Union[bool, int]] = False, + ): + DeltaBase.__init__(self, + backbone_model, + modified_modules=modified_modules, + exclude_modules=exclude_modules, + unfrozen_modules=unfrozen_modules, + common_structure=common_structure, + interactive_modify=interactive_modify, + ) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # not registered in parent class + setattr(self, arg_name, locals()[arg_name]) + + self.delta_modules = nn.ModuleList() + + self.add_all_delta_to_backbone(self.backbone_model, + self.modified_modules, + ) + + def add_all_delta_to_backbone(self, + module: nn.Module, + modified_modules: List[str], + ) -> nn.Module: + first_modified_module = None + # Current, We assume the layerer are in order in named_modules. + # Thus the first modified module is the first module that the tensor flows to. 
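+        # Added clarification: the reparameterization network is inserted (via
+        # insert_sequential_module below) on this first modified module, so its pre_forward
+        # recomputes every layer's past_key_reparam / past_value_reparam before any prefix
+        # layer consumes them.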
+ for key, _ in module.named_modules(): + if self.find_key(key, modified_modules): + logger.debug("find key {}".format(key)) + if first_modified_module is None: + _, _, ref = self.find_module(module, key) + first_modified_module = ref + self.update_module(module, key) + + self._pseudo_data_to_instantiate(module) + + if self.reparameterize: + reparams = ReparameterizeFunction(prefix_token_num=self.prefix_token_num, + embed_dim=self.embed_dim, + mid_dim=self.mid_dim, + module_list=self.delta_modules) + self.delta_modules = None + self.reparams = reparams + self.insert_sequential_module(first_modified_module, delta_module=reparams, delta_name="reparams", strict=False) + self.mark_as_delta() + return module + + + + def update_module(self, module: nn.Module, key: str): + _, _, ref = self.find_module(module, key) + + prefixlayer, ref = self.new_module_like(ref) + self.insert_sequential_module(ref, delta_module=prefixlayer, delta_name="prefix") + self.delta_modules.append(prefixlayer) + + def new_module_like(self, module): + # TODO: support more Attention modules + + if isinstance(module, T5Attention) or isinstance(module, T5LayerSelfAttention): + if isinstance(module, T5LayerSelfAttention): + module = module.SelfAttention # innermodule + module_device = get_device(module) + prefixlayer = PrefixLayerT5(prefix_token_num=self.prefix_token_num, num_heads=module.n_heads ,device=module_device) + elif isinstance(module, MultiHeadSelfAttention): # MultiHeadSelfAttention didn't provide past_key_value in the interface of the forward function. + module_device = get_device(module) + prefixlayer = PrefixLayerDistilBert(prefix_token_num=self.prefix_token_num, device=module_device) + self.insert_sequential_module(getattr(module, "k_lin"), pre_caller=prefixlayer.key_pre_forward, post_caller=prefixlayer.key_forward) + self.insert_sequential_module(getattr(module, "v_lin"), pre_caller=prefixlayer.value_pre_forward, post_caller=prefixlayer.value_forward) + elif isinstance(module, BertAttention): + module_device = get_device(module) + prefixlayer = PrefixLayerBert(prefix_token_num=self.prefix_token_num, num_heads=module.self.num_attention_heads ,device=module_device) + elif isinstance(module, RobertaAttention): + module_device = get_device(module) + prefixlayer = PrefixLayerRoberta(prefix_token_num=self.prefix_token_num, num_heads=module.self.num_attention_heads,device=module_device) + elif isinstance(module, GPT2Attention): + module_device = get_device(module) + prefixlayer = PrefixLayerGPT2(prefix_token_num=self.prefix_token_num, num_heads=module.num_heads ,device=module_device) + elif isinstance(module, BartAttention): + module_device = get_device(module) + prefixlayer = PrefixLayerBart(prefix_token_num=self.prefix_token_num, num_heads=module.num_heads ,device=module_device) + else: + raise NotImplementedError(f"We haven't implement Prefix Tuning Layer for {module.__class__.__name__}. 
Please refer to https://opendelta.readthedocs.io/en/latest/notes/faq.html for detail.") + return prefixlayer, module + + + + + + + + + + + + diff --git a/OpenDelta-0.3.2/opendelta/delta_models/soft_prompt.py b/OpenDelta-0.3.2/opendelta/delta_models/soft_prompt.py new file mode 100644 index 0000000..b7d7692 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/delta_models/soft_prompt.py @@ -0,0 +1,230 @@ +from opendelta.utils.signature import get_arg_names, get_arg_names_inside_func +from opendelta.utils.name_based_addressing import * +from opendelta.utils.cuda import get_device +from opendelta.basemodel import DeltaBase +from typing import * +import torch +import torch.nn as nn +from opendelta import BaseDeltaConfig +from opendelta import logging +logger = logging.get_logger(__name__) + +class SoftPromptConfig(BaseDeltaConfig): + r""" + This is the configuration class to store the configuration of a :py:class:`SoftPromptModel` + + """ + def __init__( + self, + soft_token_num=100, + init_range = 0.5, + token_init = True, + **kwargs + ): + super().__init__(**kwargs) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # the arg has not been registered in parent config + setattr(self, arg_name, locals()[arg_name]) + + + +class SoftPromptLayer(nn.Module): + r"""This is the implementation of `The Power of Scale for Parameter-Efficient + Prompt Tuning `_ . Similar to :obj:`PrefixTuningTemplate`, + This template also does not need any textual template. Addition tokens are directly + concatenated into the input ids. There are two initializations of the new tokens. + (1). random initialization. (2) initialize with the tokens of the plm (We simply take + the first n_tokens similar to their implementation). + + Note that this template can be simply achieved by :obj:`SoftManualTemplate`, in which + you set ``n_token`` tokens template before the will give the same result. + """ + + def __init__(self, + soft_token_num: int = 100, + raw_embedding: Optional[torch.Tensor] = None, + init_range: Optional[float] = 0.5, + other_expand_ids: Optional[Dict] = {"attention_mask":1, "token_type_ids":0}, + token_init = False, + pad_id = 0, + device: Optional[str]=None, + ): + super().__init__() + self.__dict__['raw_embedding'] = raw_embedding + + self.init_range = init_range + self.num_tokens = soft_token_num + self.pad_id = pad_id + self.token_init = token_init + self.device = device + self.other_expand_ids = other_expand_ids + + assert self.num_tokens>0 + self.instantiate(raw_embedding(torch.tensor([0])).shape[-1]) + + # self.all_pseudo_tokens = {} + + def pre_forward(self, *args, **kwargs): + # if attention_mask is passed as PLM's input, modify it here + if 'encoder_outputs' in kwargs and kwargs['encoder_outputs'] is not None: + # In generation, the input is forward through the model again. 
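+            # Added clarification: when `encoder_outputs` is already computed (the decoding steps
+            # of generation), the soft prompt was prepended during the encoder pass, so the
+            # inputs are left untouched here.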
+ return args, kwargs + + if 'input_ids' in kwargs: + input_ids = kwargs['input_ids'] + kwargs['input_ids'] = None + elif len(args) > 0: + input_ids = args[0] + args = args[1:] + else: + input_ids = None + + + if 'attention_mask' not in kwargs or kwargs['attention_mask'] is None: + # infer attention mask + if input_ids is None: + raise RuntimeError("no input ids found") + kwargs['attention_mask'] = (input_ids != self.pad_id).to(torch.int64) + + if 'inputs_embeds' not in kwargs or kwargs['inputs_embeds'] is None: + try: + inputs_embeds = self.raw_embedding(input_ids) + except: + raise RuntimeError("neither inputs_embeds nor input_ids is specified.") + else: + inputs_embeds = kwargs['inputs_embeds'] + + + + batch_size = inputs_embeds.size(0) + soft_embeds = self.soft_embeds.repeat(batch_size, 1, 1) + inputs_embeds = torch.cat([soft_embeds, inputs_embeds], 1) + kwargs['inputs_embeds'] = inputs_embeds + + for expand_key in self.other_expand_ids: + if expand_key in kwargs: + real_tokens = kwargs[expand_key] + # if expand_key in self.all_pseudo_tokens: + # pseudo_tokens = self.all_pseudo_tokens[expand_key].to(real_tokens.device) + # else: + pseudo_tokens_value = self.other_expand_ids[expand_key] + pseudo_tokens = torch.ones( + (*real_tokens.shape[:-1], inputs_embeds.shape[-2]-real_tokens.shape[-1]), + dtype = real_tokens.dtype, + device=real_tokens.device) * pseudo_tokens_value + # self.all_pseudo_tokens[expand_key] = pseudo_tokens + real_tokens.data = torch.cat([pseudo_tokens, real_tokens], dim=-1) + + return args, kwargs + + def instantiate(self, hidden_dim) -> None: + """ + generate parameters needed for soft tokens embedding in soft-prompt + for soft tokens, use a new embedding layer which is initialized with their corresponding embedding of hard tokens + """ + soft_embeds = torch.FloatTensor(self.num_tokens, hidden_dim) + if self.token_init: + soft_embeds.data = torch.clone(self.raw_embedding(torch.tensor([i for i in range(self.num_tokens)]))) + else: + soft_embeds = soft_embeds.uniform_(-self.init_range, self.init_range) + + self.soft_embeds = nn.Parameter(soft_embeds, requires_grad=True).to(self.device) + + +class SoftPromptModel(DeltaBase): + r""" + This is the implementation of `The Power of Scale for Parameter-Efficient + Prompt Tuning `_ . Similar to :obj:`PrefixTuningTemplate`, + This template also does not need any textual template. Addition tokens are directly + concatenated into the input ids. There are two initializations of the new tokens. + (1). random initialization. (2) initialize with the tokens of the plm (We simply take + the first n_tokens similar to their implementation). + + Note that this template can be simply achieved by :obj:`SoftManualTemplate`, in which + you set ``n_token`` tokens template before the will give the same result. + + Args: + + backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. + soft_token_num (:obj:`int`, *optional*): num of new tokens to add in the front of the input. + init_range (:obj:`float`, *optional*): If initialize new tokens randomly, the random range of uniform distribution. + token_init (:obj:`bool`, *optional*, default to :obj:`True`): Whether to initialize the new tokens with tokens of the PLM. + other_expand_ids (:obj:`dict`, *optional*, default to ``{'attention_mask':1, 'token_type_ids':0}``): The name of other tokens and its default value that expand along with the input sequence. 
For example, when you prepend 100 tokens to the input_ids, the attention_mask should be extended, and the token_type_ids should be extended as well. + modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only the implemented ones). + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen together with the prefix parameters. + common_structure (:obj:`bool`): whether using name-based addressing with a common structure mapping. + + + """ + + config_class = SoftPromptConfig + delta_type = "soft_prompt" + default_modified_modules = ["root"] # not used + _supported_backends = ['hf', 'bmt'] + _need_pseudo_data = False + def __init__(self, + backbone_model: nn.Module, + soft_token_num=100, + init_range = 0.5, + token_init=True, + other_expand_ids={"attention_mask":1, "token_type_ids":0}, + modified_modules: Optional[List[str]] = None, + exclude_modules: Optional[List[str]] = None, + unfrozen_modules: Optional[List[str]] = None, + common_structure: Optional[bool] = None, + interactive_modify: Optional[Union[bool, int]] = False, + ): + DeltaBase.__init__(self, + backbone_model = backbone_model, + modified_modules = ["root"], + exclude_modules = exclude_modules, + unfrozen_modules = unfrozen_modules, + common_structure = False, + interactive_modify = interactive_modify, + ) + + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # not registered in parent class + setattr(self, arg_name, locals()[arg_name]) + + + try: + self.__dict__['raw_embedding'] = self.backbone_model.get_input_embeddings() + except AttributeError: + raise AttributeError(f"'{type(self.backbone_model)}' object has no attribute 'get_input_embeddings', please pass "+ + "input embeddings into 'self.raw_embedding' in user-specific ways.") + + self.delta_modules = nn.ModuleList() + self.add_all_delta_to_backbone(self.backbone_model, + self.modified_modules, + ) + + def add_all_delta_to_backbone(self, + module: nn.Module, + modified_modules: List[str], + ) -> nn.Module: + self.update_module() + self.mark_as_delta() + return module + + def update_module(self): + soft_prompt_layer = self.new_module_like(self.raw_embedding) + self.insert_sequential_module(self.backbone_model.get_encoder() if self.backbone_model.config.is_encoder_decoder else self.backbone_model,delta_module=soft_prompt_layer,delta_name="soft_prompt_layer" ) + + def new_module_like(self, module): + module_device = get_device(module) + soft_prompt_layer = SoftPromptLayer( + soft_token_num = self.soft_token_num, + raw_embedding = self.raw_embedding, + other_expand_ids = self.other_expand_ids, + token_init = self.token_init, + init_range = self.init_range, + device = module_device, + ) + if self.backend == 'bmt': + import bmtrain as bmt + soft_prompt_layer = bmt.BMTrainModelWrapper(soft_prompt_layer) + self.delta_modules.append(soft_prompt_layer) + return soft_prompt_layer diff --git a/OpenDelta-0.3.2/opendelta/utils/__init__.py b/OpenDelta-0.3.2/opendelta/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/OpenDelta-0.3.2/opendelta/utils/backend.py b/OpenDelta-0.3.2/opendelta/utils/backend.py new file mode 100644 index 0000000..0b5b124 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/backend.py @@ -0,0 +1,110 @@ + + +import importlib + + +class BackendMapping: + """ + " A mapping config to object (model or tokenizer for instance) that will load keys and values when it is accessed. 
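+    In OpenDelta it is used to map abstract layer kinds (``"linear"``, ``"layer_norm"``,
+    ``"module"``, ``"parameter"``) to the concrete classes of the chosen backend, importing them
+    lazily on first use, e.g. ``BackendMapping("hf").check_type(sub_module, "linear")``.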
+ + Args: + + - config_mapping: The map model type to config class + - model_mapping: The map model type to model (or tokenizer) class + """ + + def __init__(self, backend): + self.backend = backend + assert backend in ['hf', 'bmt'], "Backend should be one of 'hf', 'bmt'. " + if backend == 'hf': + self.backend_mapping = { + "linear": "torch.nn.Linear", + "layer_norm": "torch.nn.LayerNorm", + "module": "torch.nn.Module", + "parameter": "torch.nn.Parameter" + } + elif backend == 'bmt': + self.backend_mapping = { + "linear": "model_center.layer.Linear", + "layer_norm": "model_center.layer.LayerNorm", + "module": "bmtrain.layer.DistributedModule", + "parameter": "bmtrain.nn.DistributedParameter" + } + self.registered = {} + + def load(self, model_type): + if model_type not in self.registered: + splited = self.backend_mapping[model_type].split(".") + module_name, class_name = ".".join(splited[:-1]), splited[-1] + module = importlib.import_module(module_name) + the_class = getattr(module, class_name) + self.registered[model_type] = the_class + return self.registered[model_type] + + def check_type(self, module, expect_type): + the_class = self.load(expect_type) + if isinstance(module, the_class): + return True + else: + return False + + + # def keys(self): + # mapping_keys = [ + # self._load_attr_from_module(key, name) + # for key, name in self._config_mapping.items() + # if key in self._model_mapping.keys() + # ] + # return mapping_keys + list(self._extra_content.keys()) + + # def get(self, key, default): + # try: + # return self.__getitem__(key) + # except KeyError: + # return default + + # def __bool__(self): + # return bool(self.keys()) + + # def values(self): + # mapping_values = [ + # self._load_attr_from_module(key, name) + # for key, name in self._model_mapping.items() + # if key in self._config_mapping.keys() + # ] + # return mapping_values + list(self._extra_content.values()) + + # def items(self): + # mapping_items = [ + # ( + # self._load_attr_from_module(key, self._config_mapping[key]), + # self._load_attr_from_module(key, self._model_mapping[key]), + # ) + # for key in self._model_mapping.keys() + # if key in self._config_mapping.keys() + # ] + # return mapping_items + list(self._extra_content.items()) + + # def __iter__(self): + # return iter(self.keys()) + + # def __contains__(self, item): + # if item in self._extra_content: + # return True + # if not hasattr(item, "__name__") or item.__name__ not in self._reverse_config_mapping: + # return False + # model_type = self._reverse_config_mapping[item.__name__] + # return model_type in self._model_mapping + + # def register(self, key, value): + # """ + # Register a new model in this mapping. 
+ # """ + # if hasattr(key, "__name__") and key.__name__ in self._reverse_config_mapping: + # model_type = self._reverse_config_mapping[key.__name__] + # if model_type in self._model_mapping.keys(): + # raise ValueError(f"'{key}' is already used by a Transformers model.") + + # self._extra_content[key] = value + + diff --git a/OpenDelta-0.3.2/opendelta/utils/common_structures/__init__.py b/OpenDelta-0.3.2/opendelta/utils/common_structures/__init__.py new file mode 100644 index 0000000..6c98f94 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/common_structures/__init__.py @@ -0,0 +1,24 @@ +CoreMappings = {} + +import importlib +import os +import sys + +cur_path = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, cur_path) + +filelist = os.listdir(cur_path) + +for file in filelist: + if not file.endswith(".py"): + continue + elif file.endswith("__init__.py"): + continue + else: + filename = file[:-3] + mappings = importlib.import_module(f".utils.common_structures.{filename}", "opendelta") + CoreMappings.update(mappings.Mappings) + + + + \ No newline at end of file diff --git a/OpenDelta-0.3.2/opendelta/utils/common_structures/bert.py b/OpenDelta-0.3.2/opendelta/utils/common_structures/bert.py new file mode 100644 index 0000000..b58a255 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/common_structures/bert.py @@ -0,0 +1,28 @@ +Mappings = {} + +Mappings['BertModel'] = { + "embeddings.word_embeddings": {"__name__":"embeddings"}, + "embeddings.position_embeddings": {"__name__":""}, + "embeddings.token_type_embeddings": {"__name__":""}, + "embeddings.LayerNorm": {"__name__":""}, + "encoder": {"__name__":"encoder", + "layer": {"__name__":"block", + "$": {"__name__":"$", + "attention": {"__name__":"attn", + "self.query": {"__name__":"q"}, + "self.key": {"__name__":"k"}, + "self.value": {"__name__":"v"}, + "output.dense": {"__name__":"proj"}, + "output.LayerNorm": {"__name__":"layer_norm"}, + }, + "output": {"__name__":"ff", + "dense": {"__name__":"w2"}, + "LayerNorm": {"__name__":"layer_norm"} + }, + "intermediate": {"__name__":"ff", + "dense": {"__name__":"w1"}, + } + } + } + }, +} diff --git a/OpenDelta-0.3.2/opendelta/utils/common_structures/debertav2.py b/OpenDelta-0.3.2/opendelta/utils/common_structures/debertav2.py new file mode 100644 index 0000000..727d03b --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/common_structures/debertav2.py @@ -0,0 +1,31 @@ + +Mappings = {} + +Mappings['DebertaV2Model'] = { + "embeddings.word_embeddings": {"__name__":"embeddings"}, + "embeddings.LayerNorm": {"__name__":""}, + "encoder": {"__name__":"encoder", + "layer": {"__name__":"block", + "$": {"__name__":"$", + "attention": {"__name__":"attn", + "self.query_proj": {"__name__":"q"}, + "self.key_proj": {"__name__":"k"}, + "self.value_proj": {"__name__":"v"}, + "output.dense": {"__name__":"proj"}, + "output.LayerNorm": {"__name__":"layer_norm"}, + }, + "output": {"__name__":"ff", + "dense": {"__name__":"w2"}, + "LayerNorm": {"__name__":"layer_norm"} + }, + "intermediate.dense": {"__name__":"ff.w1"}, + } + }, + "rel_embeddings": {"__name__": ""}, + "LayerNorm": {"__name__": ""}, + "conv": {"__name__": "", + "conv": {"__name__": ""}, + "LayerNorm": {"__name__": ""} + } + }, +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/opendelta/utils/common_structures/gpt2.py b/OpenDelta-0.3.2/opendelta/utils/common_structures/gpt2.py new file mode 100644 index 0000000..d486187 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/common_structures/gpt2.py @@ -0,0 +1,22 @@ + +Mappings = {} + 
+Mappings['GPT2Model'] = { + "wte": {"__name__":"embeddings"}, + "wpe": {"__name__":""}, + "h": {"__name__":"decoder.block", + "$": {"__name__":"$", + "attn": {"__name__":"attn", + "c_attn": {"__name__":"q,k,v"}, + "c_proj": {"__name__":"proj"}, + }, + "ln_1": {"__name__":"attn.layer_norm"}, + "mlp":{ "__name__": "ff", + "c_fc": {"__name__":"w1"}, + "c_proj": {"__name__":"w2"} + }, + "ln_2": {"__name__":"ff.layer_norm"}, + }, + }, + "ln_f": {"__name__":"decoder.layer_norm"}, +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/opendelta/utils/common_structures/opt.py b/OpenDelta-0.3.2/opendelta/utils/common_structures/opt.py new file mode 100644 index 0000000..c1092e7 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/common_structures/opt.py @@ -0,0 +1,25 @@ + + +Mappings = {} +Mappings['OPTModel'] = { + "decoder.embed_tokens": {"__name__":"embeddings"}, + "decoder.embed_positions": {"__name__":""}, + "decoder.project_out": {"__name__":""}, + "decoder.project_in": {"__name__":""}, + "decoder": {"__name__":"decoder", + "layers": {"__name__":"block", + "$": {"__name__":"$", + "self_attn": {"__name__":"attn", + "q_proj": {"__name__":"q"}, + "k_proj": {"__name__":"k"}, + "v_proj": {"__name__":"v"}, + "out_proj": {"__name__":"proj"} + }, + "self_attn_layer_norm": {"__name__":"layer_norm"}, + "fc1": {"__name__":"ff.w1", "__virtual__": "ff", "__order__": "first"}, + "fc2": {"__name__":"ff.w2","__virtual__": "ff", "__order__": "last"}, + "final_layer_norm": {"__name__":"layer_norm"}, + } + } + } +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/opendelta/utils/common_structures/roberta.py b/OpenDelta-0.3.2/opendelta/utils/common_structures/roberta.py new file mode 100644 index 0000000..94ce813 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/common_structures/roberta.py @@ -0,0 +1,27 @@ +Mappings = {} + +Mappings['RobertaModel'] = {"embeddings.word_embeddings": {"__name__":"embeddings"}, + "embeddings.position_embeddings": {"__name__":""}, + "embeddings.token_type_embeddings": {"__name__":""}, + "embeddings.LayerNorm": {"__name__":""}, + "encoder": {"__name__":"encoder", + "layer": {"__name__":"block", + "$": {"__name__":"$", + "attention": {"__name__":"attn", + "self.query": {"__name__":"q"}, + "self.key": {"__name__":"k"}, + "self.value": {"__name__":"v"}, + "output.dense": {"__name__":"proj"}, + "output.LayerNorm": {"__name__":"layer_norm"}, + }, + "output": {"__name__":"ff", + "dense": {"__name__":"w2"}, + "LayerNorm": {"__name__":"layer_norm"} + }, + "intermediate": {"__name__":"ff", + "dense": {"__name__":"w1"}, + } + } + } + }, +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/opendelta/utils/common_structures/t5.py b/OpenDelta-0.3.2/opendelta/utils/common_structures/t5.py new file mode 100644 index 0000000..8150fe2 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/common_structures/t5.py @@ -0,0 +1,71 @@ +Mappings = {} + +t5encoder = {"__name__":"encoder", + "embed_tokens": {"__name__":"embeddings"}, + "block": {"__name__":"block", + "$": {"__name__":"$", + "layer.0": {"__name__":"attn", + "SelfAttention.q": {"__name__":"q"}, + "SelfAttention.k": {"__name__":"k"}, + "SelfAttention.v": {"__name__":"v"}, + "SelfAttention.o": {"__name__":"proj"}, + "SelfAttention.relative_attention_bias": {"__name__":""}, + "layer_norm": {"__name__":"layer_norm"}, + }, + "layer.1": {"__name__":"ff", + "DenseReluDense.wi": {"__name__":"w1"}, + "layer_norm": {"__name__":"layer_norm"}, + "DenseReluDense.wo": {"__name__":"w2"}, + } + } + }, + "final_layer_norm": 
{"__name__":"layer_norm"}, + } + +t5decoder = {"__name__":"decoder", + "embed_tokens": {"__name__":"embeddings"}, + "block": {"__name__":"block", + "$": {"__name__":"$", + "layer.0": {"__name__":"attn", + "SelfAttention.q": {"__name__":"q"}, + "SelfAttention.k": {"__name__":"k"}, + "SelfAttention.v": {"__name__":"v"}, + "SelfAttention.o": {"__name__":"proj"}, + "SelfAttention.relative_attention_bias": {"__name__":""}, + "layer_norm": {"__name__":"layer_norm"}, + }, + "layer.1": {"__name__":"crossattn", + "EncDecAttention.q": {"__name__":"q"}, + "EncDecAttention.k": {"__name__":"k"}, + "EncDecAttention.v": {"__name__":"v"}, + "EncDecAttention.o": {"__name__":"proj"}, + "layer_norm": {"__name__":"layer_norm"}, + }, + "layer.2": {"__name__":"ff", + "DenseReluDense.wi": {"__name__":"w1"}, + "layer_norm": {"__name__":"layer_norm"}, + "DenseReluDense.wo": {"__name__":"w2"}, + } + } + }, + "final_layer_norm": {"__name__":"layer_norm"}, + } + + + +Mappings['T5Model'] = { + "shared": {"__name__":"embeddings"}, + "encoder": t5encoder, + "decoder": t5decoder, +} + +Mappings['T5ForConditionalGeneration'] = { + "shared": {"__name__":"embeddings"}, + "encoder": t5encoder, + "decoder": t5decoder, +} + +Mappings['T5EncoderModel'] = { + "shared": {"__name__":"embeddings"}, + "encoder": t5encoder, +} \ No newline at end of file diff --git a/OpenDelta-0.3.2/opendelta/utils/cuda.py b/OpenDelta-0.3.2/opendelta/utils/cuda.py new file mode 100644 index 0000000..fcfbc10 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/cuda.py @@ -0,0 +1,81 @@ +from typing import Union +import torch.nn as nn +import torch + +def get_device(module : Union[nn.Module, nn.Parameter]): + if not (isinstance(module, nn.Module) \ + or isinstance(module, nn.Parameter)): + raise RuntimeError("module is not a instance of torch.nn.Module") + if hasattr(module, 'device'): + return module.device + else: + params_devices = [p.device for p in module.parameters()] + if len(params_devices) == 0: + return None + elif len(set(params_devices))==1: + return params_devices[0] + else: + raise RuntimeError("The module is paralleled acrossed device, please get device in a inner module") + +def get_dtype(module : Union[nn.Module, nn.Parameter]): + if not (isinstance(module, nn.Module) \ + or isinstance(module, nn.Parameter)): + raise RuntimeError("module is not a instance of torch.nn.Module") + if hasattr(module, 'dtype'): + return module.dtype + else: + params_dtypes = [p.dtype for p in module.parameters()] + if len(params_dtypes) == 0: + return None + elif len(set(params_dtypes))==1: + return params_dtypes[0] + else: + raise RuntimeError("The module has multiple dtype, please get device in a inner module") + +def move_dict_to_cuda(dict_of_tensor, device): + for key in dict_of_tensor: + if isinstance(dict_of_tensor[key], torch.Tensor): + dict_of_tensor[key] = dict_of_tensor[key].to(device) + return dict_of_tensor + + + +# unitest, should be removed later +if __name__ == "__main__": + import torch + import torch.nn as nn + + a = nn.Parameter(torch.randn(3,5)) + + class MyNet(nn.Module): + def __init__(self): + super().__init__() + + class MyNet2(nn.Module): + def __init__(self): + super().__init__() + self.l1 = nn.Linear(3,5).to('cuda:2') + self.l2 = nn.Linear(3,5).to('cuda:2') + + class MyNet3(nn.Module): + def __init__(self): + super().__init__() + self.l1 = nn.Linear(3,5).to('cuda:3') + self.l2 = nn.Linear(3,5).cuda() + + class MyNet4: + pass + + b = MyNet() + c = MyNet2() + d = MyNet3() + e = MyNet4() + + print(get_device(a)) + print(get_device(b)) + 
print(get_device(c)) + print(get_device(e)) + print(get_device(d)) + + + diff --git a/OpenDelta-0.3.2/opendelta/utils/data_parallel.py b/OpenDelta-0.3.2/opendelta/utils/data_parallel.py new file mode 100644 index 0000000..8c32297 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/data_parallel.py @@ -0,0 +1,74 @@ +# This utils is used to support Using pytorch's native DataParallel method, +# which create several backbone model inside DataParallel. +# DistributedDataParallel doesn't need this function. +from opendelta.utils.decorate import decorate +from collections import OrderedDict + +def sequential_caller(_org_func, org_module, delta_name, *args, **kwargs): + args = args[1:] # the first argument here is ``self`` + delta_module = getattr(org_module, delta_name) + if hasattr(delta_module, "pre_forward"): + args, kwargs = delta_module.pre_forward(*args, **kwargs) + ret = _org_func(*args, **kwargs) + if hasattr(delta_module, "post_forward"): + ret = delta_module.post_forward(ret) + return ret + +def before_caller(_org_func, org_module, delta_name, *args, **kwargs): + args = args[1:] # the first argument here is ``self`` + delta_module = getattr(org_module, delta_name) + if hasattr(delta_module, "pre_forward"): + args, kwargs = delta_module.pre_forward(*args, **kwargs) + ret = _org_func(*args, **kwargs) + return ret + +def after_caller(_org_func, org_module, delta_name, *args, **kwargs): + args = args[1:] # the first argument here is ``self`` + delta_module = getattr(org_module, delta_name) + ret = _org_func(*args, **kwargs) + if hasattr(delta_module, "post_forward"): + ret = delta_module.post_forward(ret) + return ret + +def parallel_caller(_org_func, org_module, delta_name, *args, **kwargs): + args = args[1:] # the first argument here is ``self`` + delta_module = getattr(org_module, delta_name) + ret_1 = _org_func(*args, **kwargs) + ret_2 = delta_module.forward(*args, **kwargs) + return ret_1 + ret_2 + +caller_map = { + "sequential": sequential_caller, + "parallel": parallel_caller, + "before": before_caller, + "after": after_caller, +} + +def new_replicate_for_data_parallel(self): + r""" self is the parent module. + """ + # rewrite the replicate in DataParallel. + replica = self.__new__(type(self)) + org_forward = replica.forward + replica.__dict__ = self.__dict__.copy() + assert replica.forward != org_forward + replica.__dict__['forward'] = org_forward + + + for _delta_info in self._delta_infos: + if _delta_info['state'] == 'on': + if _delta_info['method'] in caller_map.keys(): + caller = caller_map[_delta_info['method']] + new_forward = decorate(replica.forward, caller, extras=(replica, _delta_info['delta_name']), kwsyntax=True) + else: + raise NotImplementedError(f"data_parallel for _delta_info['method']=='{_delta_info['method']}' is not supported") + replica.__dict__['forward'] = new_forward.__get__(replica, type(replica)) + + # replicas do not have parameters themselves, the replicas reference the original + # module. 
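+    # Editor's note: the four assignments below mirror
+    # torch.nn.Module._replicate_for_data_parallel; the only extra step in this
+    # rewrite is the forward re-decoration above, so the delta modules'
+    # pre_forward/post_forward hooks are re-attached on every replica instead
+    # of being lost when DataParallel copies the module.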
+ replica._parameters = OrderedDict() + replica._buffers = replica._buffers.copy() + replica._modules = replica._modules.copy() + replica._is_replica = True + + return replica \ No newline at end of file diff --git a/OpenDelta-0.3.2/opendelta/utils/decorate.py b/OpenDelta-0.3.2/opendelta/utils/decorate.py new file mode 100644 index 0000000..d8782e5 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/decorate.py @@ -0,0 +1,75 @@ +# copied and modified from decorator.decorate + +import re +import sys +import inspect +import operator +import itertools +from contextlib import _GeneratorContextManager +from inspect import getfullargspec, iscoroutinefunction, isgeneratorfunction + +def fix(args, kwargs, sig): + """ + Fix args and kwargs to be consistent with the signature + """ + ba = sig.bind(*args, **kwargs) + ba.apply_defaults() # needed for test_dan_schult + return ba.args, ba.kwargs + + +def decorate(func, caller, extras=(), kwsyntax=False): + """ + Decorates a function/generator/coroutine using a caller. + If kwsyntax is True calling the decorated functions with keyword + syntax will pass the named arguments inside the ``kw`` dictionary, + even if such argument are positional, similarly to what functools.wraps + does. By default kwsyntax is False and the the arguments are untouched. + + **The difference between this function and decorator.decorate is that + is support nested decorate. + """ + sig = inspect.signature(func) + if iscoroutinefunction(caller): + async def fun(*args, **kw): + if not kwsyntax: + args, kw = fix(args, kw, sig) + return await caller(func, *(extras + args), **kw) + elif isgeneratorfunction(caller): + def fun(*args, **kw): + if not kwsyntax: + args, kw = fix(args, kw, sig) + for res in caller(func, *(extras + args), **kw): + yield res + else: + def fun(*args, **kw): + if not kwsyntax: + args, kw = fix(args, kw, sig) + return caller(func, *(extras + args), **kw) + fun.__name__ = func.__name__ + fun.__doc__ = func.__doc__ + __wrapped__ = func # support nested wrap + fun.__signature__ = sig + fun.__qualname__ = func.__qualname__ + # builtin functions like defaultdict.__setitem__ lack many attributes + try: + fun.__defaults__ = func.__defaults__ + except AttributeError: + pass + try: + fun.__kwdefaults__ = func.__kwdefaults__ + except AttributeError: + pass + try: + fun.__annotations__ = func.__annotations__ + except AttributeError: + pass + try: + fun.__module__ = func.__module__ + except AttributeError: + pass + try: + fun.__dict__.update(func.__dict__) + except AttributeError: + pass + fun.__wrapped__ = __wrapped__ # support nested wrap + return fun \ No newline at end of file diff --git a/OpenDelta-0.3.2/opendelta/utils/delta_center.py b/OpenDelta-0.3.2/opendelta/utils/delta_center.py new file mode 100644 index 0000000..9bf185d --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/delta_center.py @@ -0,0 +1,10 @@ +from DeltaCenter import OssClient +from .file_utils import default_cache_path + + +def download(finetuned_delta_path, cache_dir=None, force_download=False): + if cache_dir is None: + cache_dir = default_cache_path + path_to_unzip_file = OssClient.download(finetuned_delta_path, dest=cache_dir, force_download=force_download) + return path_to_unzip_file + diff --git a/OpenDelta-0.3.2/opendelta/utils/delta_hub.py b/OpenDelta-0.3.2/opendelta/utils/delta_hub.py new file mode 100644 index 0000000..504fc54 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/delta_hub.py @@ -0,0 +1,29 @@ + + +def create_hub_repo_name(root = "DeltaHub", + dataset = None, + delta_type = 
None, + model_name_or_path = None, + center_value_only_tags = None, + center_key_value_tags = None + ): + r"""Currently, it's only a simple concatenation of the arguments. + """ + repo_name = [] + + repo_name.append(f"{delta_type}") + model_name_or_path = model_name_or_path.split("/")[-1] + repo_name.append(f"{model_name_or_path}") + repo_name.append(f"{dataset}") + + repo_name.extend(list(center_value_only_tags) if center_value_only_tags else [None]) + repo_name.extend([f"{k}-{v}" for k,v in center_key_value_tags.items()] if center_key_value_tags else [None]) + + repo_name = "_".join(repo_name) + + repo_name = root+"/"+repo_name + return repo_name + + + + diff --git a/OpenDelta-0.3.2/opendelta/utils/file_utils.py b/OpenDelta-0.3.2/opendelta/utils/file_utils.py new file mode 100644 index 0000000..2e82768 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/file_utils.py @@ -0,0 +1,3 @@ +import os +default_cache_path = "{}/.cache/delta_center/".format(os.path.expanduser('~')) +WEIGHTS_NAME = 'pytorch_model.bin' \ No newline at end of file diff --git a/OpenDelta-0.3.2/opendelta/utils/inspect.py b/OpenDelta-0.3.2/opendelta/utils/inspect.py new file mode 100644 index 0000000..830298e --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/inspect.py @@ -0,0 +1,112 @@ + +import torch +import torch.nn as nn +from typing import Optional +import opendelta.utils.logging as logging + +logger = logging.get_logger(__name__) + + +def inspect_module_statistics(module: Optional[nn.Module]=None, verbose=True): + r"""Get the statistics of the parameters in the delta modules. + + Args: + module (:obj:`nn.Module`, *optional*): The module to compute the statistics. + + Returns: + :obj:`dict`: The statistics of the parameters in the delta modules. + + """ + + stat = {} + n_trainable = num_trainable_parameters(module) + n_total = num_total_parameters(module) + + stat['total_parameters'] = n_total + stat['trainable_parameters'] = n_trainable + + stat['trainable_ratio'] = n_trainable/n_total + + n_delta = num_delta_parameters(module) + n_total = num_total_parameters(module) + stat['delta_parameters'] = n_delta + stat['delta_ratio'] = n_delta/n_total + + cudamem = 0 + maxcudamem = 0 + for device_id in range(torch.cuda.device_count()): + cudamem += torch.cuda.memory_allocated(f"cuda:{device_id}")/1024**3 + maxcudamem += torch.cuda.max_memory_allocated(f"cuda:{device_id}")/1024**3 + stat['cudamem'] = cudamem + stat['maxcudamem'] = maxcudamem + + if verbose: + logger.info(stat) + + return stat + +def num_trainable_parameters(module: Optional[nn.Module]=None): + r"""[NODOC] A small sugar function to get the number of trainable parameter in the backbone model. Often used to + compute the trainable rate. + + Args: + module (:obj:`nn.Module`): of which module we want to know the number of trainable paramemters. + + Returns: + :obj:`List[nn.Parameter]` + """ + pnum_tot = 0 + for param in module.parameters(): + if param.requires_grad: + pnum_tot += param.numel() + return pnum_tot + + +def num_total_parameters(module: Optional[nn.Module]=None): + r"""[NODOC] A small sugar function to get the number of trainable parameter in the backbone model. Often used to + compute the trainable rate. + + Args: + module (:obj:`nn.Module`): of which module we want to know the number of trainable paramemters. 
+ + Returns: + :obj:`List[nn.Parameter]` + """ + pnum_tot = 0 + for param in module.parameters(): + pnum_tot += param.numel() + return pnum_tot + +def num_delta_parameters(module: Optional[nn.Module]=None): + r"""[NODOC] A small sugar function to get the number of trainable parameter in the backbone model. Often used to + compute the trainable rate. + + Args: + module (:obj:`nn.Module`): of which module we want to know the number of trainable paramemters. + + Returns: + :obj:`List[nn.Parameter]` + """ + pnum_tot = 0 + for param in module.parameters(): + if hasattr(param, "_is_delta"): + pnum_tot += param.numel() + return pnum_tot + +def inspect_optimizer_statistics(optimizer, verbose=True): + stats = {} + for id, param_group in enumerate(optimizer.param_groups): + stat = {} + fine_grain_info = [(p.numel(), p.requires_grad) for p in param_group['params']] + stat['total_parameters'] = sum(n for n, r in fine_grain_info) + stat['trainable_parameters'] = sum(n for n, r in fine_grain_info if r) + stat['trainable_ratio'] = "{:.6f}%".format(stat['trainable_parameters']/stat['total_parameters']*100) + for key in param_group: + if key != 'params': + stat[key] = param_group[key] + stats[f'param_group_{id}'] = stat + + if verbose: + logger.info(f"optimizer info: {stats}") + + return stat diff --git a/OpenDelta-0.3.2/opendelta/utils/interactive/__init__.py b/OpenDelta-0.3.2/opendelta/utils/interactive/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/OpenDelta-0.3.2/opendelta/utils/interactive/templates/index.html b/OpenDelta-0.3.2/opendelta/utils/interactive/templates/index.html new file mode 100644 index 0000000..c6235f8 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/interactive/templates/index.html @@ -0,0 +1,176 @@ +$def with (content) + + + + + + + + + + +$:content + + + + + diff --git a/OpenDelta-0.3.2/opendelta/utils/interactive/web.py b/OpenDelta-0.3.2/opendelta/utils/interactive/web.py new file mode 100644 index 0000000..d7b5627 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/interactive/web.py @@ -0,0 +1,127 @@ +from bigmodelvis import Visualization +import web +import re, os + +space = " " +prefix0 = space * 9 +prefix1 = f"│"+space*5 +prefix2 = f"├─{space}" +prefix3 = f"└─{space}" + +def colorfy(label): + i = 0 + res = "" + while i < len(label): + if label[i] == '[': + color = "" + i += 1 + while label[i] != ']': + color += label[i] + i += 1 + i += 1 + if color[0].isdigit(): # dims but not color + res += f'[{color}]' + else: + if res != "": res += '' + res += f'' + else: + res += label[i] + i += 1 + res += '' + return res + +compressed_pattern_1 = re.compile("[0-9]+-[0-9]+") +compressed_pattern_2 = re.compile(".+(,.+)+") + +def expand_part(part): + res = [] + if compressed_pattern_1.fullmatch(part): + st, ed = map(int, part.split('-')) + for i in range(st, ed+1): + res.append( str(i) ) + elif compressed_pattern_2.fullmatch(part): + for c in part.split(','): + res.append( c ) + else: + res.append( part ) + return res + +def dfs(o, depth, last, old_name): + html = "" + module_names = expand_part(o.module_name) + if depth > 0: + old_last_1 = last[-1] + if len(module_names) > 1: + module_names = [o.module_name] + module_names + for ith, module_name in enumerate(module_names): + if ith == 0: + html += f'
' + elif ith == 1: + html += f'
' + + for i in range(depth-1): + html += prefix0 if last[i] else prefix1 + if depth > 0: + last[-1] = old_last_1 & (ith == 0 or ith == len(module_names)-1) + html += prefix3 if last[-1] else prefix2 + length = len(o.children) + if length > 0: + html += f'' + name = old_name + module_name + if ith > 0: + label = f'[red]{module_name}{o.label[o.label.index("[",1):]}' + else: + label = o.label + html += f'' + if len(module_names) > 1 and ith == 0: + html += '' + html += '
' + html += f'
' + for i, child in enumerate(o.children): + last = last + [i == length-1] + html += dfs(child, depth+1, last, name + ".") + last.pop() + + html += "
" + if ith == 0 or (ith > 1 and ith == len(module_names)-1): + html += "
" + return html + +urls = ( + '/submit/(.*)', 'submit', + '/(.*)', 'hello', +) + +class PortApplication(web.application): + def run(self, port=8080, *middleware): + func = self.wsgifunc(*middleware) + return web.httpserver.runsimple(func, ('0.0.0.0', port)) + +app = PortApplication(urls, globals()) +render = web.template.render(os.path.join(os.path.dirname(__file__), 'templates/')) +names = [] + +class hello: + def GET(self, name): + return render.index(content=html) +class submit: + def GET(self, _): + global names + names = [name[5:] for name in web.input(name=[]).name] + app.stop() + +def interactive(model, port=8888): + tree = Visualization(model).structure_graph(printTree=False) + + global html + html = dfs(tree, 0, [], "") + + print() + print("If on your machine, open the link below for interactive modification.\n " + "If on remote host, you could use port mapping, " + "or run in vscode terminal, which automatically do port mapping for you.") + app.run(port) + global names + print("modified_modules:") + print(names) + return names diff --git a/OpenDelta-0.3.2/opendelta/utils/logging.py b/OpenDelta-0.3.2/opendelta/utils/logging.py new file mode 100644 index 0000000..2211c6f --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/logging.py @@ -0,0 +1,279 @@ +# coding=utf-8 +# Copyright 2020 Optuna, Hugging Face +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# OpenDelta copied from Huggingface Transformers +""" Logging utilities.""" + +import logging +import os +import sys +import threading +from logging import CRITICAL # NOQA +from logging import DEBUG # NOQA +from logging import ERROR # NOQA +from logging import FATAL # NOQA +from logging import INFO # NOQA +from logging import NOTSET # NOQA +from logging import WARN # NOQA +from logging import WARNING # NOQA +from typing import Optional + + +_lock = threading.Lock() +_default_handler: Optional[logging.Handler] = None + +log_levels = { + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL, +} + +_default_log_level = logging.INFO + + +def _get_default_logging_level(): + """ + If TRANSFORMERS_VERBOSITY env var is set to one of the valid choices return that as the new default level. If it is + not - fall back to ``_default_log_level`` + """ + env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None) + if env_level_str: + if env_level_str in log_levels: + return log_levels[env_level_str] + else: + logging.getLogger().warning( + f"Unknown option TRANSFORMERS_VERBOSITY={env_level_str}, " + f"has to be one of: { ', '.join(log_levels.keys()) }" + ) + return _default_log_level + + +def _get_library_name() -> str: + + return __name__.split(".")[0] + + +def _get_library_root_logger() -> logging.Logger: + + return logging.getLogger(_get_library_name()) + + +def _configure_library_root_logger() -> None: + + global _default_handler + + with _lock: + if _default_handler: + # This library has already configured the library root logger. 
+ return + _default_handler = logging.StreamHandler() # Set sys.stderr as stream. + _default_handler.flush = sys.stderr.flush + formatter = logging.Formatter( + "[%(levelname)s|(OpenDelta)%(module)s:%(lineno)d]%(asctime)s >> %(message)s") + _default_handler.setFormatter(formatter) + + # Apply our default configuration to the library root logger. + library_root_logger = _get_library_root_logger() + library_root_logger.addHandler(_default_handler) + library_root_logger.setLevel(_get_default_logging_level()) + + + library_root_logger.propagate = False + + +def _reset_library_root_logger() -> None: + + global _default_handler + + with _lock: + if not _default_handler: + return + + library_root_logger = _get_library_root_logger() + library_root_logger.removeHandler(_default_handler) + library_root_logger.setLevel(logging.NOTSET) + _default_handler = None + + +def get_log_levels_dict(): + return log_levels + + + +def get_verbosity() -> int: + """ + Return the current level for the 🤗 Transformers's root logger as an int. + Returns: + :obj:`int`: The logging level. + + 🤗 Transformers has following logging levels: + - 50: ``transformers.logging.CRITICAL`` or ``transformers.logging.FATAL`` + - 40: ``transformers.logging.ERROR`` + - 30: ``transformers.logging.WARNING`` or ``transformers.logging.WARN`` + - 20: ``transformers.logging.INFO`` + - 10: ``transformers.logging.DEBUG`` + """ + + _configure_library_root_logger() + return _get_library_root_logger().getEffectiveLevel() + + +def set_verbosity(verbosity: int) -> None: + """ + Set the verbosity level for the 🤗 Transformers's root logger. + Args: + verbosity (:obj:`int`): + Logging level, e.g., one of: + - ``transformers.logging.CRITICAL`` or ``transformers.logging.FATAL`` + - ``transformers.logging.ERROR`` + - ``transformers.logging.WARNING`` or ``transformers.logging.WARN`` + - ``transformers.logging.INFO`` + - ``transformers.logging.DEBUG`` + """ + + _configure_library_root_logger() + _get_library_root_logger().setLevel(verbosity) + + +def set_verbosity_info(): + """Set the verbosity to the ``INFO`` level.""" + return set_verbosity(INFO) + + +def set_verbosity_warning(): + """Set the verbosity to the ``WARNING`` level.""" + return set_verbosity(WARNING) + + +def set_verbosity_debug(): + """Set the verbosity to the ``DEBUG`` level.""" + return set_verbosity(DEBUG) + + +def set_verbosity_error(): + """Set the verbosity to the ``ERROR`` level.""" + return set_verbosity(ERROR) + + +def disable_default_handler() -> None: + """Disable the default handler of the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert _default_handler is not None + _get_library_root_logger().removeHandler(_default_handler) + + +def enable_default_handler() -> None: + """Enable the default handler of the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert _default_handler is not None + _get_library_root_logger().addHandler(_default_handler) + + +def add_handler(handler: logging.Handler) -> None: + """adds a handler to the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert handler is not None + _get_library_root_logger().addHandler(handler) + + +def remove_handler(handler: logging.Handler) -> None: + """removes given handler from the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert handler is not None and handler not in _get_library_root_logger().handlers + _get_library_root_logger().removeHandler(handler) + + +def 
disable_propagation() -> None: + """ + Disable propagation of the library log outputs. Note that log propagation is disabled by default. + """ + + _configure_library_root_logger() + _get_library_root_logger().propagate = False + + +def enable_propagation() -> None: + """ + Enable propagation of the library log outputs. Please disable the HuggingFace Transformers's default handler to + prevent double logging if the root logger has been configured. + """ + + _configure_library_root_logger() + _get_library_root_logger().propagate = True + + +def enable_explicit_format() -> None: + """ + Enable explicit formatting for every HuggingFace Transformers's logger. The explicit formatter is as follows: + ``` + [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE + ``` + All handlers currently bound to the root logger are affected by this method. + """ + handlers = _get_library_root_logger().handlers + + for handler in handlers: + formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s") + handler.setFormatter(formatter) + + +def reset_format() -> None: + """ + Resets the formatting for HuggingFace Transformers's loggers. + All handlers currently bound to the root logger are affected by this method. + """ + handlers = _get_library_root_logger().handlers + + for handler in handlers: + handler.setFormatter(None) + + +def warning_advice(self, *args, **kwargs): + """ + This method is identical to ``logger.warning()``, but if env var TRANSFORMERS_NO_ADVISORY_WARNINGS=1 is set, this + warning will not be printed + """ + no_advisory_warnings = os.getenv("TRANSFORMERS_NO_ADVISORY_WARNINGS", False) + if no_advisory_warnings: + return + self.warning(*args, **kwargs) + + +logging.Logger.warning_advice = warning_advice + + +def get_logger(name: Optional[str] = None, verbosity='info') -> logging.Logger: + """ + Return a logger with the specified name. + This function is not supposed to be directly accessed unless you are writing a custom transformers module. + """ + + if name is None: + name = _get_library_name() + + _configure_library_root_logger() + logger = logging.getLogger(name) + logger.setLevel(log_levels[verbosity]) + return logger diff --git a/OpenDelta-0.3.2/opendelta/utils/model_md5.py b/OpenDelta-0.3.2/opendelta/utils/model_md5.py new file mode 100644 index 0000000..3666295 --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/model_md5.py @@ -0,0 +1,36 @@ +import hashlib + +def gen_model_hash(model, with_parameters=True): + r"""Get model hash (structure and parameter) + """ + str_model_structure = str(model).replace("\n","").replace(" ","").replace("\t","").encode('utf-8') + md5 = hashlib.md5(str_model_structure) + + if with_parameters: + md5 = gen_parameter_hash(model.parameters(), md5=md5) + + md5_code = md5.hexdigest() + return md5_code + + + +def gen_parameter_hash(generator, md5=None): + r"""Get parameter hash. 
From https://zhuanlan.zhihu.com/p/392942816 + + """ + if md5 is None: + md5 = hashlib.md5() + for arg in generator: + x = arg.data + if hasattr(x, "cpu"): + md5.update(x.cpu().numpy().data.tobytes()) + elif hasattr(x, "numpy"): + md5.update(x.numpy().data.tobytes()) + elif hasattr(x, "data"): + md5.update(x.data.tobytes()) + else: + try: + md5.update(x.encode("utf-8")) + except: + md5.update(str(x).encode("utf-8")) + return md5 \ No newline at end of file diff --git a/OpenDelta-0.3.2/opendelta/utils/name_based_addressing.py b/OpenDelta-0.3.2/opendelta/utils/name_based_addressing.py new file mode 100644 index 0000000..e0d569f --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/name_based_addressing.py @@ -0,0 +1,46 @@ +from typing import List, Union +import re +def superstring_in(str_a: str , list_b: List[str]): + r"""check whether there is any string in list b containing str_a. + + Args: + Returns: + """ + return any(str_a in str_b for str_b in list_b) + +def is_child_key(str_a: str , list_b: List[str]): + r"""check whether a string in ``list_b`` is the child key in ``str_a`` + + Args: + Returns: + """ + return any(str_b in str_a and (str_b==str_a or str_a[len(str_b)]==".") for str_b in list_b) + +def endswith_in(str_a: str, list_b: List[str]): + return endswith_in_regex(str_a, [b[3:] for b in list_b if b.startswith("[r]")]) or \ + endswith_in_normal(str_a, [b for b in list_b if not b.startswith("[r]")]) + +def endswith_in_normal(str_a: str , list_b: List[str]): + r"""check whether ``str_a`` has a substring that is in list_b. + + Args: + Returns: + """ + return any(str_a.endswith(str_b) and (str_a==str_b or str_a[-len(str_b)-1] == ".") for str_b in list_b) + +def endswith_in_regex(str_a: str , list_b: List[str]): + r"""check whether ``str_a`` has a substring that is in list_b. + + Args: + Returns: + """ + for str_b in list_b: + ret = re.search(re.compile(str_b), str_a) + if ret is not None: + b = ret.group() + if ret.span()[1] == len(str_a) and (b == str_a or (str_a==b or str_a[-len(b)-1] == ".")): + # the latter is to judge whether it is a full sub key in the str_a, e.g. 
str_a=`attn.c_attn` and list_b=[`attn`] will given False + return True + return False + + \ No newline at end of file diff --git a/OpenDelta-0.3.2/opendelta/utils/saving_loading_utils.py b/OpenDelta-0.3.2/opendelta/utils/saving_loading_utils.py new file mode 100644 index 0000000..4b9b92e --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/saving_loading_utils.py @@ -0,0 +1,434 @@ + +from typing import Dict, List, Union, Optional, Callable +from opendelta.delta_configs import BaseDeltaConfig +from opendelta.utils.model_md5 import gen_model_hash, gen_parameter_hash +import torch +import os +from opendelta import logging +import torch.nn as nn +from DeltaCenter import OssClient +import yaml +from dataclasses import dataclass, field, fields +import datetime +from .file_utils import WEIGHTS_NAME + +logger = logging.get_logger(__name__) + + + +alternative_names = { + "train_tasks": ["train_tasks", "train_task", "task_name"], +} + + +@dataclass +class DeltaCenterArguments: + """ + The arguments that are used to distinguish between different delta models on the DeltaCenter + """ + name: str = field(default="", + metadata={"help": "The name of the delta model checkpoint"} + ) + backbone_model: str = field(default="", + metadata={"help": "The backbone model of the delta model"} + ) + backbone_model_path_public: str = field( + default = None, + metadata={"help": "Publicly available path (url) to pretrained model or model identifier from huggingface.co/models"} + ) + delta_type: str = field( + default=None, + metadata={"help": "the type of type model, e.g., adapter, lora, etc."} + ) + train_tasks: Optional[Union[List[str], str]]= field( + default=None, + metadata={"help": "the task(s) that the delta is trained on"} + ) + train_datasets: Optional[Union[List[str], str]]= field( + default=None, + metadata={"help": "the datasets(s) that the delta is trained on"} + ) + checkpoint_size: Optional[float] = field( + default=None, + metadata={"help": "the size of the checkpoint, in MB"} + ) + test_tasks: Optional[Union[List[str], str]] = field( + default=None, + metadata={"help": "the task(s) that the delta is tested on"} + ) + test_datasets: Optional[Union[List[str], str]] = field( + default=None, + metadata={"help": "the dataset(s) that the delta is tested on"} + ) + test_performance: Optional[float] = field( + default=None, + metadata={"help": "the performance of the model on the test set"} + ) + test_metrics: Optional[str] = field( + default=None, + metadata={"help": "the metrics used by the model"} + ) + trainable_ratio: Optional[float] = field( + default=None, + metadata={"help": "the ratio of trainable parameters in the model"} + ) + delta_ratio: Optional[float] = field( + default=None, + metadata={"help": "the ratio of delta parameters in the model"} + ) + usage: Optional[str] = field( + default="", + metadata={"help": "the usage code of the model"} + ) + license: Optional[str] = field( + default="apache-2.0", + metadata={"help": "the license of the model"} + ) + + + +class SaveLoadMixin: + def add_configs_when_saving(self,): + self.config.backbone_class = self.backbone_model.__class__.__name__ + if hasattr(self.backbone_model, "config"): + self.config.backbone_checkpoint_name = os.path.split(self.backbone_model.config._name_or_path.strip("/"))[-1] + self.config.backbone_hash = gen_model_hash(self.backbone_model) + + + + def save_finetuned( + self, + finetuned_delta_path: Optional[Union[str, os.PathLike]] = "./delta_checkpoints/", + save_config: bool = True, + state_dict: Optional[dict] = None, + 
save_function: Callable = torch.save, + push_to_dc: bool = False, + center_args: Optional[Union[DeltaCenterArguments, dict]] = dict(), + center_args_pool: Optional[dict] = dict(), + list_tags: Optional[List] = list(), + dict_tags: Optional[Dict] = dict(), + delay_push: bool = False, + test_result = None, + usage: Optional[str] = "", + ): + r""" + Save a model and its configuration file to a directory, so that it can be re-loaded using the + :py:meth:`~DeltaBase.save_finetuned` class method. + + Arguments: + finetuned_delta_path: (optional) path to the directory where the model and its configuration file will be saved. + If not specified, the model will be saved in the directory ``./delta_checkpoints/``, + which is a subdirectory of the current working directory. + save_config: (optional) if ``True``, the configuration file will be saved in the same directory as the + model file. if ``False``, only the state dict will be saved. + state_dict: (optional) a dictionary containing the model's state_dict. If not specified, the + state_dict is loaded from the backbone model's trainable parameters. + save_function: (optional) the function used to save the model. Defaults to ``torch.save``. + state_dict_only: (optional) if ``True``, only the state_dict will be saved. + push_to_dc: (optional) if ``True``, the model will prepare things to pushed to the DeltaCenter. + This includes: + - creating a configuration file for the model + - creating a directory for the model + - saving the model's trainable parameters + - pushing the model to the DeltaCenter + center_args: (optional) the arguments that are used to distinguish between different delta models on the DeltaCenter + center_args_pool: (optional) a dictionary containing the arguments that are used to distinguish between different delta models on the DeltaCenter + list_tags: (optional) a list of tags that will be added to the model's configuration file + dict_tags: (optional) a dictionary of tags that will be added to the model's configuration file + delay_push: (optional) if ``True``, the model will not be pushed to the DeltaCenter. This is useful if you want to + push the model later. + + """ + + # create the config to save, including model hash, etc. 
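+        # Editor's overview of the rest of this method: the state_dict is
+        # written to WEIGHTS_NAME (pytorch_model.bin) inside the save directory
+        # using ``save_function``; the delta config is saved next to it when
+        # ``save_config`` is True; and when ``push_to_dc`` is set, a config.yml
+        # is generated from DeltaCenterArguments and the directory is uploaded
+        # via OssClient unless ``delay_push`` is given.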
+ if save_config: + if not hasattr(self, "config"): + self.create_config_from_model() + self.add_configs_when_saving() + + if push_to_dc: + final_center_args = self.create_delta_center_args(center_args=center_args, + center_args_pool=center_args_pool) + + save_directory = finetuned_delta_path + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + os.makedirs(save_directory, exist_ok=True) + + if push_to_dc: + save_directory = os.path.join(save_directory, final_center_args.name) + os.makedirs(save_directory, exist_ok=True) + + model_to_save = self.backbone_model# unwrap_model(self) + + # Save the model + if state_dict is None: + state_dict = model_to_save.state_dict() + + output_model_file = os.path.join(save_directory, WEIGHTS_NAME) + save_function(state_dict, output_model_file) + + # Save the config + if save_config: + self.config.save_finetuned(save_directory) + + + + + + + logger.info("\n"+"*"*30+f"\nYou delta models has been saved locally to:\t{os.path.abspath(save_directory)}" + ) + self.compute_saving(output_model_file) + + state_dict_total_params = sum(p.numel() for p in state_dict.values()) + other_tags={} + other_tags.update({'state_dict_total_params(M)':state_dict_total_params/1024/1024}) + other_tags.update({'test_result':test_result}) + if push_to_dc: + logger.info("Creating yaml file for delta center") + self.create_yml(save_directory, final_center_args, list_tags, dict_tags, other_tags) + + if not delay_push: + OssClient.upload(base_dir=save_directory) + else: + logger.info(f"Delay push: you can push it to the delta center later using \n\tpython -m DeltaCenter upload {os.path.abspath(save_directory)}\n" + +"*"*30) + else: + logger.info("We encourage users to push their final and public models to delta center to share them with the community!") + + def compute_saving(self, output_model_file): + import os + stats = os.stat(output_model_file) + if stats.st_size > (1024**3): + unit = 'GB' + value = stats.st_size/(1024**3) + else: + unit = 'MB' + value = stats.st_size/(1024**2) + logger.info("The state dict size is {:.3f} {}".format(value, unit)) + + + + + def create_yml(self, save_dir, config, list_tags=list(), dict_tags=dict(),other_tags=None): + f = open("{}/config.yml".format(save_dir), 'w') + config_dict = vars(config) + config_dict['dict_tags'] = dict_tags + config_dict['list_tags'] = list_tags + if other_tags is not None: + config_dict.update(other_tags) + yaml.safe_dump(config_dict, f) + f.close() + + def load_checkpoint(self, path, load_func=torch.load, backbone_model=None): + r"""Simple method for loading only the checkpoint + """ + if backbone_model is None: + backbone_model = self.backbone_model + self.backbone_model.load_state_dict(load_func(f"{path}/{WEIGHTS_NAME}"), strict=False) + + def save_checkpoint(self, path, save_func=torch.save, backbone_model=None): + r"""Simple method for saving only the checkpoint""" + if backbone_model is None: + backbone_model = self.backbone_model + save_func(backbone_model.state_dict(), f"{path}/{WEIGHTS_NAME}") + + @classmethod + def from_finetuned(cls, + finetuned_delta_path: Optional[Union[str, os.PathLike]], + backbone_model: nn.Module, + delta_config = None, + cache_dir: Optional[Union[str, os.PathLike]] = None, + state_dict: Optional[dict] = None, + *model_args, + force_download: Optional[bool] = False, + check_hash: Optional[bool] = True, + local_files_only: Optional[bool] = False, + **kwargs): + r""" + Instantiate a finetuned delta model from a 
path. + The backbone_model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated). + To further train the model, you can use the :meth:`freeze_module ` method. + + Parameters: + finetuned_delta_path: (optional) path to the directory where the model and its configuration file will be saved. + If not specified, the model will be loaded from the directory cahce directory. (see ``cache_dir``), + backbone_model: the backbone model that will be used to instantiate the finetuned delta model. + delta_config: (optional) the configuration file of the finetuned delta model. If not specified, the configuration file + is loaded from the directory ``finetuned_delta_path``. + cache_dir: (optional) path to the directory where the model and its configuration file will be saved. + If not specified, we will first look into current working directory, then the cache directory of your system, e.g., ~/.cache/delta_center/, + state_dict: (optional) a dictionary containing the model's state_dict. If not specified, the + state_dict is loaded from the ``finetuned_delta_path``. + force_download: (optional) if ``True``, the model will be downloaded from the internet even if it is already + present in the cache directory. + check_hash: (optional) if ``True``, check whether the hash of the model once it's trained differs from what we load now. + local_files_only: (optional) if ``True``, the model will be loaded from the local cache directory. + """ + + if os.environ.get("DELTACENTER_OFFLINE", '0') == '1': + logger.info("Delta Center offline mode!") + local_files_only = True + + # Load config if we don't provide a configuration + + + finetuned_delta_path = str(finetuned_delta_path) + + if cache_dir is not None: + cached_finetuned_delta_path = os.path.join(cache_dir, finetuned_delta_path) + else: + cached_finetuned_delta_path = finetuned_delta_path + + download_from_dc = False + if os.path.isfile(cached_finetuned_delta_path): + raise RuntimeError( + f"You should pass a directory to load a delta checkpoint instead of a file, " + f"since we need the delta's configuration file." + ) + elif os.path.isdir(cached_finetuned_delta_path): + if os.path.isfile(os.path.join(cached_finetuned_delta_path, WEIGHTS_NAME)): + # Load from a PyTorch checkpoint + weight_file = os.path.join(cached_finetuned_delta_path, WEIGHTS_NAME) + else: + raise EnvironmentError( + f"Error no file named {WEIGHTS_NAME} found in " + f"directory {cached_finetuned_delta_path}." + ) + + else: + # try to download from DeltaCenter + from .delta_center import download as dcdownload + cached_finetuned_delta_path = dcdownload(finetuned_delta_path, cache_dir=cache_dir, force_download=force_download) + download_from_dc = True + weight_file = os.path.join(cached_finetuned_delta_path, WEIGHTS_NAME) + + if state_dict is None: + state_dict = torch.load(weight_file, map_location="cpu") + + if not isinstance(delta_config, BaseDeltaConfig): + delta_config, model_kwargs = cls.config_class.from_finetuned( + cached_finetuned_delta_path, + cache_dir=None, + return_unused_kwargs=True, + local_files_only=True if download_from_dc else local_files_only, # has been downloaded + **kwargs, + ) + + else: + model_kwargs = kwargs + + + # Initialize the model from config and attach the delta model to the backbone_model. + delta_model = cls.from_config(delta_config, backbone_model, *model_args, **model_kwargs, ) + + # load the state_dict into the backbone_model. 
As the delta model's parameter + # is the same object as the deltas in the backbone model with different reference name, + # the state_dict will also be loaded into the delta model. + delta_model._load_state_dict_into_backbone(backbone_model, state_dict) + + backbone_hash = gen_model_hash(backbone_model) + + if check_hash: + if hasattr(delta_config, "backbone_hash") and \ + delta_config.backbone_hash is not None and \ + delta_config.backbone_hash != backbone_hash: + logger.warning("The config has an hash of the backbone model, and is" + "different from the hash of the loaded model. This indicates a mismatch" + "between the backbone model that the delta checkpoint is based on and" + "the one you loaded. You propobability need to Train the model instead of" + "directly inference. ") + else: + logger.info("Hash-check passed. You can safely use this checkpoint directly.") + else: + logger.warning("Parameters' hash has not been checked!") + + + # Set model in evaluation mode to deactivate DropOut modules by default + backbone_model.eval() + + return delta_model + + + def create_delta_center_args(self, center_args, center_args_pool): + """ + Create the delta center args for the center model. + center_args has higher priority than center_args_pool. + + """ + mdict = {} + field = fields(DeltaCenterArguments) + + + for f in field: + exist = False + # first is center_args, exact match + if f.name in center_args: + mdict[f.name] = center_args[f.name] + continue + # second is center_args_pool, can use alternative names + if f.name in center_args_pool: + mdict[f.name] = center_args_pool[f.name] + exist = True + elif f.name in alternative_names: + for altername in alternative_names[f.name]: + if altername in center_args_pool: + mdict[f.name] = center_args_pool[altername] + exist = True + break + # if not exist, find from self.stat or set to default + if not exist: + if f.name in self.stat: + mdict[f.name] = self.stat[f.name] + else: + mdict[f.name] = f.default + + # if eventualy name is not set, create a default one + if mdict['name'] is None or mdict['name'] == '': + logger.info("Name is not set, use default name.") + mdict['name'] = self.create_default_name(**mdict) + + if len(mdict['usage']) == 0: + logger.info("Usage is not set, use default usage.") + mdict['usage'] = self.create_default_usage(mdict['name']) + + + center_args = DeltaCenterArguments(**mdict) + return center_args + + def create_default_usage(self, name): + usage_str = """from opendelta import AutoDeltaModel\n""" + \ + """delta_model = AutoDeltaModel.from_finetuned('{name_with_userid}', backbone_model=model)\n""" + \ + """delta_model.freeze_module() # if you are going to further train it \n""" + \ + """delta_model.log()""" + return usage_str + + def create_default_name(self, **kwargs): + r"""Currently, it's only a simple concatenation of the arguments. + """ + + reponame = "" + reponame += kwargs["backbone_model_path_public"].split("/")[-1]+"_" if kwargs['backbone_model_path_public'] is not None else kwargs['backbone_model'] + reponame += kwargs["delta_type"]+"_" if kwargs["delta_type"] is not None else "" + + # tasks + if isinstance(kwargs["train_tasks"], list): + train_tasks = "+".join(kwargs["train_tasks"]) + elif kwargs["train_tasks"] is not None: + train_tasks = kwargs["train_tasks"] + else: + logger.warning("train_tasks are not find in all arguments. 
Do you miss them?") + train_tasks = None + reponame += train_tasks+"_" if train_tasks is not None else "" + + # time + reponame += datetime.datetime.now().strftime("%Y%m%d%H%M%S") #+ gen_model_hash(model=self.backbone_model) + + # model hash + if hasattr(self.config, "backbone_hash"): + reponame += self.config.backbone_hash[:3] + return reponame + diff --git a/OpenDelta-0.3.2/opendelta/utils/signature.py b/OpenDelta-0.3.2/opendelta/utils/signature.py new file mode 100644 index 0000000..41aa95e --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/signature.py @@ -0,0 +1,55 @@ +import inspect +from collections import namedtuple + +def signature(f): + r"""Get the function f 's input arguments. A useful gadget + when some function slot might be instantiated into multiple functions. + + Args: + f (:obj:`function`) : the function to get the input arguments. + + Returns: + namedtuple : of args, default, varargs, keywords, respectively.s + + """ + sig = inspect.signature(f) + args = [ + p.name for p in sig.parameters.values() + if p.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD + ] + varargs = [ + p.name for p in sig.parameters.values() + if p.kind == inspect.Parameter.VAR_POSITIONAL + ] + varargs = varargs[0] if varargs else None + keywords = [ + p.name for p in sig.parameters.values() + if p.kind == inspect.Parameter.VAR_KEYWORD + ] + keywords = keywords[0] if keywords else None + defaults = [ + p.default for p in sig.parameters.values() + if p.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD + and p.default is not p.empty + ] or None + argspec = namedtuple('Signature', ['args', 'defaults', + 'varargs', 'keywords']) + return argspec(args, defaults, varargs, keywords) + +def get_arg_names(f): + r""" Get a functions argument name, remove the ``self`` argument + """ + args = signature(f).args + if args[0] == "self": + args = args[1:] + return args + + + +def get_arg_names_inside_func(func): + r""" Get the functions argument name inside the function itself. Remove ``self`` argument. + """ + arg_names = func.__code__.co_varnames[: func.__code__.co_argcount] + if arg_names[0] == "self": + arg_names = arg_names[1:] + return arg_names \ No newline at end of file diff --git a/OpenDelta-0.3.2/opendelta/utils/structure_mapping.py b/OpenDelta-0.3.2/opendelta/utils/structure_mapping.py new file mode 100644 index 0000000..9ebc96a --- /dev/null +++ b/OpenDelta-0.3.2/opendelta/utils/structure_mapping.py @@ -0,0 +1,118 @@ +from typing import OrderedDict +import copy +import opendelta.utils.logging as logging +from bigmodelvis import Visualization +logger = logging.get_logger(__name__) + + +from opendelta.utils.common_structures import CoreMappings + +MAPPINGERROR_MSG = f"Available Models with default configurations are {list(CoreMappings.keys())} . Please manually add the delta models by speicifying 'modified_modules' based on the visualization of your model structure. Refer to `https://opendelta.readthedocs.io/en/latest/notes/faq.html` for detail." 
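+# Editor's illustration (hand-traced, not executed library output): with the
+# BertModel mapping from common_structures/bert.py, the transform() below,
+# called as transform("encoder.layer.0.attention.self.query",
+# CoreMappings["BertModel"]), should return
+# ("encoder@.block@.0.attn@.q@", None, None): every segment resolved through
+# the mapping is tagged with a trailing "@", the layer index "0" passes through
+# unchanged via the "$" wildcard, and since no "__virtual__" node is involved,
+# the virtual key and order are both None.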
+ + +def transform(org_key, mapping, strict=True, warning=False, verbose=False): + chain = org_key.split(".") + query = "" + node = mapping + + new_chain = [] + virtual_key, virtual_chain, in_virtual_order = None, None, None + for elem in chain: + query += elem + if query in node: + node = node[query] + new_elem = node["__name__"] + if new_elem == "": + if strict: + if warning: + print(f"'{org_key}' has no common mapping.") + return + else: + new_chain.append(query) + else: + splited_new_elem = new_elem.split(".") + splited_new_elem = [e+"@" for e in splited_new_elem] + special_token = '.'.join(splited_new_elem) + if '__virtual__' in node: + virtual_chain = copy.deepcopy(new_chain) + virtual_chain.append(".".join([e+'@' for e in node["__virtual__"].split(".")])) + in_virtual_order = node['__order__'] + new_chain.append(special_token) # special token for transformed key + + + query = "" + elif "$" in node: + node = node["$"] + new_chain.append(query) + query = "" + else: + query += "." + if query!="": + if strict: + if warning: + print("A part of the orginial key hasn't been matched!") + return + else: + new_chain.append(query.strip(".")) # tailing query + + new_key = ".".join(new_chain) + if verbose: + print(f"{org_key} => {new_key}") + if virtual_chain is not None: + virtual_key = ".".join(virtual_chain) + + return new_key, virtual_key, in_virtual_order + + + +class CommonStructureMap(object): + r""" A loading structure map. + """ + + New_Mappings = CoreMappings + + SpecialModelInverseMaps = { + } + def __init__(self, backbone_model, strict=True, warning=False, visualize=True): + self.matched_pairs = {} + self.find_sub_common_structure(backbone_model, matched_pairs=self.matched_pairs) + if len(self.matched_pairs) == 0: + raise KeyError(MAPPINGERROR_MSG) + + + def __repr__(self,): + return self.mapping + + def transform(self, org_key, strict=True, warning=False): + r'''Transform a key in the original model to the name convention in common structure. + ''' + new_key = org_key + virtual_key, in_virtual_order = None, None + + for key in self.matched_pairs: + left, right = org_key[:len(key)], org_key[len(key):].strip(".") + if left == key and len(right) > 0: + transformed_key, virtual_key, in_virtual_order = transform(right, self.matched_pairs[key], strict, warning) + if len(left) > 0: + new_key = left + "." + transformed_key + else: + new_key = transformed_key + break + return new_key, virtual_key, in_virtual_order + + def find_sub_common_structure(self, module, prefix='',matched_pairs = []): + if module.__class__.__name__ in self.New_Mappings: + if self.New_Mappings[module.__class__.__name__]: + if callable(self.New_Mappings[module.__class__.__name__]): + mapping = self.New_Mappings[module.__class__.__name__](module) + else: + mapping = self.New_Mappings[module.__class__.__name__] + matched_pairs[prefix] = mapping + return + for name, m in module.named_children(): + new_prefix = '.'.join([prefix, name]) if prefix != '' else name + self.find_sub_common_structure(m, prefix=new_prefix, matched_pairs = matched_pairs) + + + + diff --git a/OpenDelta-0.3.2/requirements.txt b/OpenDelta-0.3.2/requirements.txt new file mode 100644 index 0000000..2edd9d6 --- /dev/null +++ b/OpenDelta-0.3.2/requirements.txt @@ -0,0 +1,13 @@ +torch>=1.8.0 +transformers>=4.10.0 +datasets>=1.17.0 +sentencepiece>=0.1.96 +tqdm>=4.62.2 +decorator +rich +web.py +gitpython +scipy # need? +sklearn # need? 
+delta_center_client==0.0.4 +bigmodelvis diff --git a/OpenDelta-0.3.2/setup.cfg b/OpenDelta-0.3.2/setup.cfg new file mode 100644 index 0000000..0642610 --- /dev/null +++ b/OpenDelta-0.3.2/setup.cfg @@ -0,0 +1,5 @@ +[easy_install] + +index_url = https://pypi.org/simple + +# index_url = https://pypi.tuna.tsinghua.edu.cn/simple \ No newline at end of file diff --git a/OpenDelta-0.3.2/setup.py b/OpenDelta-0.3.2/setup.py new file mode 100644 index 0000000..c95566b --- /dev/null +++ b/OpenDelta-0.3.2/setup.py @@ -0,0 +1,62 @@ + +import setuptools +import os +import os + + +requires = """torch>=1.8.0 +transformers>=4.10.0 +datasets>=1.17.0 +sentencepiece>=0.1.96 +tqdm>=4.62.2 +decorator +rich +web.py +gitpython +scipy # need? +sklearn # need? +delta_center_client==0.0.4 +bigmodelvis +""" + +def get_requirements(): + ret = [x for x in requires.split("\n") if len(x)>0] + print("requirements:", ret) + return ret + + + +# path = os.path.dirname(os.path.abspath(__file__)) +# requires = get_requirements(path) + +with open('README.md', 'r') as f: + setuptools.setup( + name = 'opendelta', + version = "0.3.2", + description = "An open source framework for delta learning (parameter efficient learning).", + long_description=open("README.md", "r", encoding="utf-8").read(), + long_description_content_type="text/markdown", + author = '', + author_email = 'shengdinghu@gmail.com', + license="Apache", + url="https://github.com/thunlp/OpenDelta", + keywords = ['PLM', 'Parameter-efficient-Learning', 'AI', 'NLP'], + python_requires=">=3.6.0", + install_requires=get_requirements(), + package_dir={'opendelta':'opendelta'}, + package_data= { + 'opendelta':["utils/interactive/templates/*.html", 'requirments.txt'], + }, + include_package_data=True, + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + ] + )
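Taken together, setup.py, the SaveLoadMixin above, and the default usage string generated in saving_loading_utils.py suggest the following minimal usage sketch. The backbone checkpoint and the delta identifier are placeholders chosen for illustration; they are not artifacts shipped with this diff.

```python
# Minimal sketch based on the default usage string in saving_loading_utils.py.
# "t5-base" and "DeltaHub/example_delta" are placeholder names.
from transformers import AutoModelForSeq2SeqLM
from opendelta import AutoDeltaModel

backbone = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

# Attach a finetuned delta to the backbone; from_finetuned looks for a local
# directory first and otherwise tries to download from DeltaCenter, and it
# puts the backbone in eval mode by default.
delta_model = AutoDeltaModel.from_finetuned(
    "DeltaHub/example_delta",   # placeholder delta checkpoint
    backbone_model=backbone,
)
delta_model.freeze_module()  # if you are going to further train it
delta_model.log()            # as in the generated usage string

# After further training, save locally; push_to_dc=False skips DeltaCenter.
delta_model.save_finetuned("./delta_checkpoints/", push_to_dc=False)
```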