Compare commits

...

7 Commits
FM_9G ... FM_9G

Author SHA1 Message Date
wql 940a8b08cb add: add opendelta package 2024-08-08 13:44:48 +08:00
wql 9175f7c9bb fix: change to GPUS_PER_NODE=1 2024-08-08 09:18:29 +08:00
wql 7655337c72 fix: change to GPUS_PER_NODE=2 2024-08-08 09:03:43 +08:00
wql 12f7320b51 fix: change GPUS_PER_NODE to 8 2024-08-07 16:49:33 +08:00
wql db532ca4b1 fix: modify paras in pretrain_dragonfly 2024-08-07 16:00:45 +08:00
wql a62a188fd5 fix: change --language to zh 2024-08-07 14:21:33 +08:00
wql 5de2ff4556 test 2024-08-07 14:03:19 +08:00
234 changed files with 22122 additions and 2 deletions

@@ -222,7 +222,7 @@ fi
GPUS_PER_NODE=1
NNODES=1
RANK=0
-MASTER_ENDPOINT=g3006
+MASTER_ENDPOINT=ubuntu
MASTER_PORT=23456
#CMD="torchrun --nnodes=${NNODES} --nproc_per_node=${GPUS_PER_NODE} --node_rank=${RANK} --master_addr=${MASTER_ENDPOINT} --master_port=${MASTER_PORT} ${PRETRAIN_ENTRY} ${OPTS}"
CMD="torchrun --nnodes=${NNODES} --nproc_per_node=${GPUS_PER_NODE} --node_rank=${RANK} --rdzv_id=1 --rdzv_backend=c10d --rdzv_endpoint=${MASTER_ENDPOINT}:${MASTER_PORT} ${PRETRAIN_ENTRY} ${OPTS}"

OpenDelta-0.3.2/.gitignore vendored Normal file

@@ -0,0 +1,71 @@
data/
**/__pycache__/
logs/*
experiments/logs
!logs/.gitkeep
datasets/*
!datasets/*.sh
.vscode/
*.egg-info/
eggs/
.eggs/
*.egg
**.egg
build/
_build/
**/build/
outputs/
log.txt
**/DeltaHub/
**/sfs_scripts/
*beans/
**/examples/*/configs/*
!examples/*/configs/config_gen.py
**/jupyter_notebook_examples/
!examples/jupyter_notebook_examples/*.py
!examples/*/configs/*.py
**/outputs_search/**/*.bin
**/outputs_search/**/*.pt
*.db
**/nohup.out
**/examples/examples_bmtrain/BigModels/down_data
**/examples/examples_bmtrain/BMTrain_stable
**/examples/examples_bmtrain/BMPretrain
**/examples/examples_bmtrain/BigModels/BigModels/results
**/Delta_Memory/
**/output/
**/thunlp/
**/saved_ckpts/
DeltaCenter-Python-Client/
backbone_structure
delta_checkpoints
gitop.sh
load_dataset_and_model.ipynb
load_model.py
scripts
t.py
t.sh
!examples/examples_prompt/configs/*/*.json
!examples/examples_prompt/configs/**
**/delta_checkpoints/
**/outputs/
dist/
dist/*
**/unittest/**
!unittest/**.py
!unittest/**.sh
!unittest/**.md
**/tutorial/**
!tutorial/**.py
!tutorial/**.sh
!tutorial/**.md

@@ -0,0 +1,29 @@
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Set the version of Python and other tools you might need
build:
  os: ubuntu-20.04
  tools:
    python: "3.9"
    # You can also specify other tool versions:
    # nodejs: "16"
    # rust: "1.55"
    # golang: "1.17"

# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: docs/conf.py

# If using Sphinx, optionally build your docs in additional formats such as PDF
# formats:
#   - pdf

# Optionally declare the Python requirements required to build your docs
python:
  install:
    - requirements: docs/requirements.txt

OpenDelta-0.3.2/LICENSE Normal file

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

OpenDelta-0.3.2/README.md Normal file

@@ -0,0 +1,161 @@
<div align="center">
<img src="https://s4.ax1x.com/2022/02/14/Hy7lAf.png" width="350px">
**An Open-Source Framework for Parameter-Efficient Tuning (Delta Tuning).**
------
<p align="center">
<a href="#Overview">Overview</a>
<a href="#installation">Installation</a>
<a href="https://opendelta.readthedocs.io/en/latest/notes/usage.html">Basic Usage</a>
<a href="https://opendelta.readthedocs.io/">Docs</a>
<a href="https://docs.google.com/spreadsheets/d/1BIVa8ocAPga-u7rBOXLYaTfaJSjI1dWfwohmLjmFDrY/edit?usp=sharing">Performance</a>
</p>
</div>
![version](https://img.shields.io/badge/version-0.3.2-blue)
## Overview
OpenDelta is a toolkit for parameter-efficient tuning methods (which we dub *delta tuning*), with which users can flexibly assign (or add) a small number of parameters to update while keeping most parameters frozen. With OpenDelta, users can easily implement prefix-tuning, adapters, LoRA, or any other type of delta tuning with their preferred PTMs.
- The latest version of OpenDelta is tested on Python==3.8.13, PyTorch==1.12.1, transformers==4.22.2. Other versions are likely to be supported as well. If you encounter bugs when using your own package versions, please raise an issue; we will look into it as soon as possible.
- **A demo of using OpenDelta to modify the PLM (e.g., BART).**
![How PLM changes using Delta-tuning](docs/source/imgs/demo.gif)
## News
- **2022.10.25** Release v0.3.2. Support [BMTrain]()! Improve docs. Add inspect utilities.
- **2022.10.14** Release v0.3.0. We make the usage of the default configurations of each delta tuning method (i.e., the positions they are attached to) more friendly! If a custom model has one of our supported models as a submodule, the default configuration is also available. Other key changes can be seen in the [Update Log](https://opendelta.readthedocs.io/en/latest/notes/update.html#version-0-3-0)
- **2022.10.10** Merge a long-developed branch v0.2.4 into the master branch. Key updates are (1) an example unifying the delta tuning paradigm and the prompt-tuning paradigm; (2) support for [Delta Center](https://www.openbmb.org/toolKits/deltacenter), whose webpage is still under construction. Details can be seen in the [Update Log](https://opendelta.readthedocs.io/en/latest/notes/update.html#version-0-2-4)
- **2022.03.24** We noticed several bugs in Soft Prompt Tuning and Prefix Tuning, mainly due to their need to customize attention ids and token_type_ids; we are fixing them! For now, please use the other methods, since they are more stable and perform better.
- **2022.03.20** Add a [colab example](https://colab.research.google.com/drive/1uAhgAdc8Qr42UKYDlgUv0f7W1-gAFwGo?usp=sharing) to illustrate efficient training and space-saving multitask-serving.
- **2022.03.20** A new pip version released.
- **2022.02.16** Support [regular expression](https://opendelta.readthedocs.io/en/latest/notes/namebasedaddr.html#regexexpr) in named-based addressing.
## Installation
1. create a virtualenv (optional)
```shell
conda create -n opendelta_env python=3.8
conda activate opendelta_env
```
2. install the latest version
```bash
pip install git+https://github.com/thunlp/OpenDelta.git
```
**or** install the latest pip version (more stable)
```bash
pip install opendelta
```
**or** build from source
```bash
git clone git@github.com:thunlp/OpenDelta.git
cd OpenDelta
python setup.py install
# python setup.py develop  # if you want to modify the code for your research
```
## Must Try
The following code and comments walk you through the key functionality of OpenDelta. It is also available in [must_try.py](https://github.com/thunlp/OpenDelta/tree/main/examples/unittest/must_try.py) and [must_try.ipynb on Colab](https://colab.research.google.com/drive/1Nbe9zxt8LGQnKmtvEs07IN_PznjNCyk4?usp=sharing).
```python
# use transformers as usual.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
t5_tokenizer = AutoTokenizer.from_pretrained("t5-large")
# A running example (the input is deliberately misspelled; the delta model loaded below does spelling correction)
inputs_ids = t5_tokenizer.encode("Is Harry Poter wrtten by JKrowling", return_tensors="pt")
t5_tokenizer.decode(t5.generate(inputs_ids)[0])
# >>> '<pad><extra_id_0>? Is it Harry Potter?</s>'
# use existing delta models
from opendelta import AutoDeltaModel, AutoDeltaConfig
# use existing delta models from DeltaCenter
delta = AutoDeltaModel.from_finetuned("thunlp/Spelling_Correction_T5_LRAdapter_demo", backbone_model=t5)
# freeze the whole backbone model except the delta models.
delta.freeze_module()
# visualize the change
delta.log()
t5_tokenizer.decode(t5.generate(inputs_ids)[0])
# >>> <pad> Is Harry Potter written by JK Rowling?</s>
# Now save only the delta model, not the whole backbone model, to .tmp/
delta.save_finetuned(".tmp")
import os; os.listdir(".tmp")
# >>> The state dict size is 1.443 MB
# >>> We encourage users to push their final and public models to delta center to share them with the community!
# reload the model from the local path and add it to the pre-trained T5.
t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
delta1 = AutoDeltaModel.from_finetuned(".tmp", backbone_model=t5)
import shutil; shutil.rmtree(".tmp") # don't forget to remove the tmp files.
t5_tokenizer.decode(t5.generate(inputs_ids)[0])
# >>> <pad> Is Harry Potter written by JK Rowling?</s>
# detach the delta model; the backbone returns to its unmodified state.
delta1.detach()
t5_tokenizer.decode(t5.generate(inputs_ids)[0])
# >>> '<pad><extra_id_0>? Is it Harry Potter?</s>'
# use the default configuration for customized wrapped models which have PLMs inside. This is a common need for users.
import torch.nn as nn
class WrappedModel(nn.Module):
def __init__(self, inner_model):
super().__init__()
self.inner = inner_model
def forward(self, *args, **kwargs):
return self.inner(*args, **kwargs)
wrapped_model = WrappedModel(WrappedModel(t5))
# say we use LoRA
delta_config = AutoDeltaConfig.from_dict({"delta_type":"lora"})
delta2 = AutoDeltaModel.from_config(delta_config, backbone_model=wrapped_model)
delta2.log()
# >>> root
# -- inner
# -- inner
# ...
# ... lora_A:[8,1024], lora_B:[1024,8]
delta2.detach()
# use a non-default configuration
# say we add LoRA to the feed-forward layers of the last four decoder blocks of t5, with lora rank=5
delta_config3 = AutoDeltaConfig.from_dict({"delta_type":"lora", "modified_modules":["[r]decoder.*((20)|(21)|(22)|(23)).*DenseReluDense\.wi"], "lora_r":5})
delta3 = AutoDeltaModel.from_config(delta_config3, backbone_model=wrapped_model)
delta3.log()
```
## Verified Default Configurations
- **You can try to use OpenDelta on *any* backbone models based on PyTorch.**
- However, there is a small chance that the interface of the submodules of the backbone model is not supported. Therefore we have verified some commonly used models that OpenDelta is sure to support.
- We will keep testing more and more emerging models.
- Pull requests are welcome when you successfully apply OpenDelta to your own backbone model.
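A minimal sketch (not from the original README) of relying on a verified default configuration, assuming a backbone such as BERT whose default positions OpenDelta knows; the `"adapter"` type string follows the `"lora"` pattern shown above and is an assumption:

```python
from transformers import AutoModelForSequenceClassification
from opendelta import AutoDeltaConfig, AutoDeltaModel

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
# No "modified_modules" given: the default configuration for this backbone is used.
delta_config = AutoDeltaConfig.from_dict({"delta_type": "adapter"})
delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model)
delta_model.freeze_module(exclude=["deltas"])  # freeze everything except the adapters
delta_model.log()                              # inspect where the deltas were attached
```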

@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.https://www.sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd

@@ -0,0 +1,20 @@
# OpenDelta Documentation
To build this documentation locally, please first install the [sphinx](https://www.sphinx-doc.org/en/master/) packages.
```
pip install sphinx
pip install sphinx_rtd_theme
pip install sphinx_copybutton
pip install sphinx_toolbox
pip install myst_parser
```
Then install opendelta either from source, or from pip. After that,
```
cd docs
make html
```
Then open the generated `docs/build/html/index.html` in your local browser.

@@ -0,0 +1,17 @@
sphinx_copybutton
sphinx_rtd_theme
sphinx_toolbox
myst_parser
torch>=1.8.0
transformers>=4.10.0
datasets==1.17.0
sentencepiece>=0.1.96
tqdm>=4.62.2
decorator
rich
web.py
gitpython
scipy # need?
sklearn # need?
delta_center_client==0.0.4

@@ -0,0 +1,268 @@
/* a, */
.wy-menu-vertical header,
.wy-menu-vertical p.caption,
.wy-nav-top .fa-bars,
.wy-menu-vertical a:hover,
/* Colors and text decoration.
For example, :black:`text in black` or :blink:`text blinking` in rST. */
/* .black {
color: black;
}
.gray {
color: gray;
}
.grey {
color: gray;
}
.silver {
color: silver;
}
.white {
color: white;
}
.maroon {
color: maroon;
}
.red {
color: red;
}
.magenta {
color: magenta;
}
.fuchsia {
color: fuchsia;
}
.pink {
color: pink;
}
.orange {
color: rgba(218, 135, 12, 0.897);
} */
/* .string {
color: rgb(172, 51, 44);
} */
/* .yellow {
color: yellow;
}
.lime {
color: lime;
}
.green {
color: green;
}
.olive {
color: olive;
}
.teal {
color: teal;
}
.cyan {
color: cyan;
}
.aqua {
color: aqua;
}
.blue {
color: blue;
}
.navy {
color: navy;
}
.purple {
color: purple;
}
.under {
text-decoration: underline;
}
.over {
text-decoration: overline;
}
.blink {
text-decoration: blink;
}
.line {
text-decoration: line-through;
}
.strike {
text-decoration: line-through;
}
.it {
font-style: italic;
}
.ob {
font-style: oblique;
}
.small {
font-size: small;
}
.large {
font-size: large;
}
.smallpar {
font-size: small;
} */
a:link {
color: rgb(141, 99, 224)
}
a:visited {
color: rgb(141, 99, 224)
}
a:hover {
color: rgb(147, 47, 218)
}
.rst-content code.literal
{
color: rgb(172, 49, 42) !important;
/* #5360f0 */
}
.rst-content tt.literal
{
color: #f06b53 !important;
}
/* #a153f0 */
/* inspired by sphinx press theme */
.wy-menu.wy-menu-vertical li.toctree-l1.current > a {
border-left: solid 15px rgb(150, 92, 232) !important;
text-indent: -15px;
border-top: none;
border-bottom: none;
}
.wy-menu.wy-menu-vertical li.toctree-l1.current > ul {
border-left: solid 15px #ddcaf7 !important;
}
/* inspired by sphinx press theme */
.wy-nav-side {
color: unset !important;
background: unset !important;
border-right: solid 1px #ccc !important;
}
.wy-side-nav-search,
.wy-nav-top,
.wy-menu-vertical li,
.wy-menu-vertical li a:hover,
.wy-menu-vertical li a
{
background: unset !important;
}
.wy-menu-vertical li.current a {
border-right: unset !important;
}
.wy-side-nav-search div,
.wy-menu-vertical a {
color: #404040 !important;
}
.wy-menu-vertical button.toctree-expand {
color: #333 !important;
}
.wy-nav-content {
max-width: unset;
}
.rst-content {
max-width: 900px;
}
.wy-nav-content .icon-home:before {
content: "Docs";
}
.wy-side-nav-search .icon-home:before {
content: "";
}
dl.field-list {
display: block !important;
}
dl.field-list > dt:after {
content: "" !important;
}
dl.field-list > dt {
display: table;
padding-left: 6px !important;
padding-right: 6px !important;
margin-bottom: 4px !important;
padding-bottom: 1px !important;
background: rgb(252, 237, 208);
border-left: solid 2px rgb(231, 181, 134);
}
dl.py.class>dt
{
color: rgba(17, 16, 17, 0.822) !important;
background: rgb(247, 234, 252) !important;
border-top: solid 2px #b620d0 !important;
}
dl.py.method>dt
{
background: rgb(250, 239, 241) !important;
border-left: solid 2px rgb(199, 83, 106) !important;
}
dl.py.attribute>dt,
dl.py.property>dt
{
background: rgba(194, 233, 248, 0.1) !important;
border-left: solid 2px #58b5cc !important;
}
.fa-plus-square-o::before, .wy-menu-vertical li button.toctree-expand::before,
.fa-minus-square-o::before, .wy-menu-vertical li.current > a button.toctree-expand::before, .wy-menu-vertical li.on a button.toctree-expand::before
{
content: "";
}
.rst-content .viewcode-back,
.rst-content .viewcode-link
{
font-size: 120%;
}

@@ -0,0 +1,7 @@
document.addEventListener("DOMContentLoaded", function(event) {
    document.querySelectorAll(".wy-menu.wy-menu-vertical > ul.current > li > a").forEach(a => a.addEventListener("click", e => {
        // Toggle visibility of the current top-level section in the sidebar.
        const f = document.querySelector(".wy-menu.wy-menu-vertical > ul.current > li > ul");
        if (f.style.display == 'none') { f.style.display = 'block'; } else { f.style.display = 'none'; }
    }));
    // Replace the default header anchor text with a link emoji.
    document.querySelectorAll(".headerlink").forEach(a => a.text = "\u{1F517}");
});

@@ -0,0 +1,147 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
import sys
sys.path.insert(0, "../../")
import datetime
import sphinx_rtd_theme
import doctest
import opendelta

# -- Project information -----------------------------------------------------

project = 'OpenDelta'
author = 'THUNLP OpenDelta Team'
copyright = '{}, {}, Licensed under the Apache License, Version 2.0'.format(datetime.datetime.now().year, author)

# The full version, including alpha/beta/rc tags
release = '0.3.2'
version = "0.3.2"

html_theme = 'sphinx_rtd_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

doctest_default_flags = doctest.NORMALIZE_WHITESPACE
autodoc_member_order = 'bysource'
intersphinx_mapping = {
    'python': ('https://docs.python.org/', None),
    'torch': ('https://pytorch.org/docs/stable/', None),
}
html_show_sourcelink = True

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
    'sphinx.ext.doctest',
    'sphinx.ext.intersphinx',
    # 'sphinx.ext.mathbase',
    'sphinx.ext.mathjax',
    'sphinx.ext.napoleon',
    'sphinx.ext.viewcode',
    'sphinx.ext.githubpages',
    'sphinx_copybutton',
    'sphinx_toolbox.collapse',
    'myst_parser',
]

myst_enable_extensions = [
    "html_image",
    "colon_fence",
    "html_admonition",
    "amsmath",
    "dollarmath",
]

source_suffix = {
    '.rst': 'restructuredtext',
    '.txt': 'markdown',
    '.md': 'markdown',
}

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
# exclude_patterns = []

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
# html_theme = 'alabaster'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_theme_options = {
    # 'collapse_navigation': False,
    # 'display_version': True,
    # 'logo_only': False,
    'navigation_depth': 2,
}
html_static_path = ['_static']
html_css_files = ['css/custom.css']
html_js_files = ['js/custom.js']

rst_context = {'opendelta': opendelta}
# rst_epilog = "\n.. include:: .special.rst\n"
add_module_names = False

def include_only_tagged(app, what, name, obj, skip, options):
    inclusion_tag_format = "[NODOC]"  # can be any pattern here, choose what works for you
    for tag in app.tags.tags:
        if obj.__doc__ is not None and not obj.__doc__.startswith(inclusion_tag_format):
            return False
    return True

def skip2(app, what, name, obj, skip, options):
    members = [
        '__init__',
        '__repr__',
        '__weakref__',
        '__dict__',
        '__module__',
    ]
    return True if name in members else skip

def skip(app, what, name, obj, skip, options):
    skip = include_only_tagged(app, what, name, obj, skip, options) or \
        skip2(app, what, name, obj, skip, options)
    return skip

def setup(app):
    def rst_jinja_render(app, docname, source):
        src = source[0]
        rendered = app.builder.templates.render_string(src, rst_context)
        source[0] = rendered

    app.connect('autodoc-skip-member', skip)
    app.connect("source-read", rst_jinja_render)

26 binary image files added (documentation images; content not shown)

@@ -0,0 +1,75 @@
OpenDelta's documentation!
=====================================
[OpenDelta](https://github.com/thunlp/OpenDelta/) is a **plug-and-play** library for parameter-efficient fine-tuning ([delta-tuning](WhatisDelta)) of pre-trained models.
## Essential Advantages:
- <span style="color:rgb(81, 217, 245);font-weight:bold">Clean:</span> No need to edit the backbone PTMs codes.
- <span style="color:orange;font-weight:bold">Simple:</span> Migrating from full-model tuning to delta-tuning takes as little as 3 lines of code (see the sketch below this list).
- <span style="color:green;font-weight:bold">Sustainable:</span> Most evolution in external libraries doesn't require a new version of OpenDelta.
- <span style="color:red;font-weight:bold">Extendable:</span> Various PTMs can share the same delta-tuning codes.
- <span style="color:purple;font-weight:bold">Flexible:</span> Able to apply delta-tuning to (almost) any position of the PTMs.
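A minimal sketch of those 3 lines (not part of the original page; it assumes a backbone, here BERT, whose default adapter positions are known to OpenDelta):

```python
from transformers import AutoModelForSequenceClassification
from opendelta import AdapterModel

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

delta_model = AdapterModel(backbone_model=model)  # 1. attach adapters at default positions
delta_model.freeze_module(exclude=["deltas"])     # 2. freeze the backbone, keep deltas trainable
delta_model.log()                                 # 3. inspect what changed
```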
```{eval-rst}
.. toctree::
   :maxdepth: 1
   :caption: Getting Started

   notes/overview.md
   notes/installation.md
   notes/quickstart.md
   notes/custom.md

.. toctree::
   :maxdepth: 1
   :caption: Advanced Usage

   notes/autodelta.md
   notes/deltacenter.md
   notes/composition.md
   notes/pluginunplug.md
   notes/withbmtrain.md
   notes/withaccelerate.md
   notes/examples.md

.. toctree::
   :maxdepth: 1
   :caption: Utilities

   notes/inspect.md

.. toctree::
   :maxdepth: 1
   :caption: Mechanisms

   notes/keyfeature.md
   notes/namebasedaddr.md
   notes/unifyname.md

.. toctree::
   :maxdepth: 1
   :caption: Information

   notes/citation.md
   notes/update.md
   notes/faq.md

.. toctree::
   :maxdepth: 2
   :caption: Documentation

   modules/base
   modules/deltas
   modules/auto_delta
   modules/utils

Indices and tables
==================

* :ref:`genindex`
```

@@ -0,0 +1,14 @@
Auto Classes
======================================

AutoDeltaConfig
------------------------------------

.. autoclass:: opendelta.auto_delta.AutoDeltaConfig
   :members:

AutoDeltaModel
------------------------------------

.. autoclass:: opendelta.auto_delta.AutoDeltaModel
   :members:

@@ -0,0 +1,14 @@
Base Classes
======================================

BaseDeltaConfig
------------------------------------

.. autoclass:: opendelta.delta_configs.BaseDeltaConfig
   :members:

DeltaBase
------------------------------------

.. autoclass:: opendelta.basemodel.DeltaBase
   :members:

@@ -0,0 +1,46 @@
Delta Models
======================================

Lora
---------------------------------------

.. autoclass:: opendelta.LoraModel
   :members:

BitFit
---------------------------------------

.. autoclass:: opendelta.BitFitModel
   :members:

Adapter
---------------------------------------

.. autoclass:: opendelta.AdapterModel
   :members:

LowRankAdapter
---------------------------------------

.. autoclass:: opendelta.LowRankAdapterModel
   :members:

Compacter
---------------------------------------

.. autoclass:: opendelta.CompacterModel
   :members:

Prefix tuning
------------------------------------

.. autoclass:: opendelta.PrefixModel
   :members:

Soft Prompt Tuning
------------------------------------

.. autoclass:: opendelta.SoftPromptModel
   :members:

@@ -0,0 +1,45 @@
# Utils
## SaveLoadMixin
```{eval-rst}
.. autoclass:: opendelta.utils.saving_loading_utils.SaveLoadMixin
   :members:
```
## Visualization
```{eval-rst}
.. autoclass:: opendelta.utils.visualization.Visualization
   :members:
```
## Structure Map
```{eval-rst}
.. autoclass:: opendelta.utils.structure_mapping.CommonStructureMap
   :members:
```
## Utility Functions
### Hashing
```{eval-rst}
.. automodule:: opendelta.utils.model_md5
   :members:
```
### Signature
```{eval-rst}
.. automodule:: opendelta.utils.signature
   :members:
```
### Named-based addressing
```{eval-rst}
.. automodule:: opendelta.utils.name_based_addressing
   :members:
```

@@ -0,0 +1,90 @@
(autodelta)=
# AutoDelta Mechanism
Inspired by [Huggingface transformers AutoClasses](https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/auto#transformers.AutoModel), we provide the AutoDelta feature, which lets users
1. easily experiment with different delta models;
2. quickly deploy from a configuration file, especially from the repos in [DeltaCenter](https://examplelink).
## Easily load from a dict, making it easy to change the type of delta model.
```python
from opendelta import AutoDeltaConfig, AutoDeltaModel
from transformers import T5ForConditionalGeneration
backbone_model = T5ForConditionalGeneration.from_pretrained("t5-base")
```
We can load a config from a dict
```python
config_dict = {
"delta_type":"lora",
"modified_modules":[
"SelfAttention.q",
"SelfAttention.v",
"SelfAttention.o"
],
"lora_r":4}
delta_config = AutoDeltaConfig.from_dict(config_dict)
```
Then use the config to add a delta model to the backbone model
```python
delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=backbone_model)
# now visualize the modified backbone_model
from bigmodelvis import Visualization
Visualization(backbone_model).structure_graph()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/t5lora.png
---
width: 600px
name: t5lora
---
```
````
## Fast deployment of a fine-tuned delta checkpoint from DeltaCenter
```python
# use transformers as usual.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
t5_tokenizer = AutoTokenizer.from_pretrained("t5-large")
# A running example
inputs_ids = t5_tokenizer.encode("Is Harry Poter wrtten by JKrowling", return_tensors="pt")
t5_tokenizer.decode(t5.generate(inputs_ids)[0])
# >>> '<pad><extra_id_0>? Is it Harry Potter?</s>'
```
Load delta model from delta center:
```python
# use existing delta models
from opendelta import AutoDeltaModel, AutoDeltaConfig
# use existing delta models from DeltaCenter
delta = AutoDeltaModel.from_finetuned("thunlp/Spelling_Correction_T5_LRAdapter_demo", backbone_model=t5)
# freeze the whole backbone model except the delta models.
delta.freeze_module()
# visualize the change
delta.log()
t5_tokenizer.decode(t5.generate(inputs_ids)[0])
# >>> <pad> Is Harry Potter written by JK Rowling?</s>
```
<div class="admonition note">
<p class="title">**Hash check**</p>
<p>
Since the delta model only works together with the backbone model,
we will automatically check whether you load the delta model the same way it was trained.
</p>
<p>
We calculate the trained model's [md5](http://some_link) and save it to the config. When the delta model finishes loading, we re-calculate the md5 to see whether it has changed.
</p>
<p> Note that passing the hash check guarantees performance, but there are cases where the hash check fails yet performance is still normal, for various reasons. We are investigating these cases. Please consider this feature a supplement. </p>
<p>Pass `check_hash=False` to disable hash checking.</p>
</div>
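For reference, a minimal sketch of disabling the check with the `check_hash` argument mentioned above (it reuses the `t5` backbone loaded earlier on this page):

```python
from opendelta import AutoDeltaModel

delta = AutoDeltaModel.from_finetuned(
    "thunlp/Spelling_Correction_T5_LRAdapter_demo",
    backbone_model=t5,   # the backbone loaded earlier on this page
    check_hash=False,    # skip the md5 consistency check
)
```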

@@ -0,0 +1,12 @@
# Citation
If you find our repo useful, please cite the following paper.
```
@article{ding2022delta,
title={Delta tuning: A comprehensive study of parameter efficient methods for pre-trained language models},
author={Ding, Ning and Qin, Yujia and Yang, Guang and Wei, Fuchao and Yang, Zonghan and Su, Yusheng and Hu, Shengding and Chen, Yulin and Chan, Chi-Min and Chen, Weize and others},
journal={arXiv preprint arXiv:2203.06904},
year={2022}
}
```

@@ -0,0 +1,51 @@
# Composition of delta models
With OpenDelta, you can compose different delta models.
## Add different deltas to the backbone
```
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("roberta-base")
from opendelta import LoraModel, AdapterModel
delta_model = LoraModel(backbone_model=model, modified_modules=['key'], lora_r=1)
delta_model2 = AdapterModel(backbone_model=model, modified_modules=['output'], bottleneck_dim=12)
delta_model.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/composition_of_delta.png
---
width: 600px
name: composition_of_delta
---
```
````
## Even add multiple delta to the same layer
```
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base")
from opendelta import AdapterModel, LowRankAdapterModel
delta_model = AdapterModel(backbone_model=model, modified_modules=['fc2'])
delta_model2 = AdapterModel(backbone_model=model, modified_modules=['fc2'], bottleneck_dim=12)
delta_model3 = LowRankAdapterModel(backbone_model=model, modified_modules=['fc2'], reduction_factor=12)
delta_model.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/multiple_to_one_layer.png
---
width: 600px
name: multiple_to_one_layer
---
```
````
:::{admonition} Order of Insertion
:class: warning
**When adding deltas to the same layer, pay attention to the order of insertion.** In the above example, the adapters are added after `fc2`: the tensor first goes through `adapter`, then `adapter_1`, and finally the low-rank adapter. If a delta is added before the backbone layer, then the last added delta is the first to be applied.
Also pay attention to the detaching order: the delta that was added first should be detached last.
:::
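A minimal sketch of that attach/detach discipline (not from the original page; it reuses the models from the example above):

```python
from transformers import AutoModelForSequenceClassification
from opendelta import AdapterModel, LowRankAdapterModel

model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base")
delta_a = AdapterModel(backbone_model=model, modified_modules=['fc2'])          # attached first
delta_b = LowRankAdapterModel(backbone_model=model, modified_modules=['fc2'],
                              reduction_factor=12)                              # attached second

# Detach in reverse order of attachment (last attached, first detached):
delta_b.detach()
delta_a.detach()
```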

@@ -0,0 +1,135 @@
# Custom Usage
Now we introduce the pipeline for migrating your full-model tuning scripts to delta tuning ones, **especially when your model is not in the default configuration list, or you don't want to use the default configuration**.
## STEP 1: Load the pretrained models
```python
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base") # suppose we load BART
```
## STEP 2: Add delta modules
We provide two alternatives to add the delta modules.
### 2.1 Visualize the backbone structure
Delta tuning's core change to the structure of the base model is to decorate (modify) its modules with small delta modules. Assume we want to treat the feedforward layer of each block as our [target modules](targetmodules). Since **different PLMs name their submodules differently**, we should first find out the name of the feedforward layer in the BART model by visualization. <img src="../imgs/hint-icon-2.jpg" height="30px"> *For more about visualization, see [Visualization](visualization).*
```python
from bigmodelvis import Visualization
Visualization(model).structure_graph()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/bart-base.png
---
width: 600px
name: bart-base
---
```
````
We can see from the structure graph that the feed forward layer in BART is called `model.encoder.layers.$.fc1` and `model.encoder.layers.$.fc2`, where `$` represents a number from 0 to 5. Since we want to apply the adapter after *all* the feed forward layers, we specify `modified_modules=['fc2']`, the common suffix of the feed forward layers.
<img src="../imgs/hint-icon-2.jpg" height="30px"> *For details about the name based addressing, see [Name-based submodule addressing](namebasedaddr)*
Other configurations, such as the `bottleneck_dim` in Adapter, can be passed as key word arguments.
```python
from opendelta import AdapterModel
delta_model = AdapterModel(backbone_model=model, modified_modules=['fc2'], bottleneck_dim=12)
delta_model.log() # This will visualize the backbone after modification and other information.
```
:::{admonition} Try different positions
:class: tip
OpenDelta provides the flexibility to add deltas at various positions in the backbone model. For example, if you want to move the adapter in the above example after the layer norm of the feed forward layer, the code should be changed into
```python
delta_model = AdapterModel(backbone_model=model, modified_modules=['final_layer_norm'], bottleneck_dim=12)
```
The performance may vary due to positional differences, but there is currently no theoretical guarantee that one position will outperform the other.
:::
:::{admonition} Favored Configurations
:class: tip
Feel confused by the flexibility that OpenDelta brings? The default configuration is the `default_modified_modules` attribute of each delta model. Generally, the default configurations are already good enough. If you want to squeeze the size of delta models further, you can refer to the following papers.
- [AdapterDrop: On the Efficiency of Adapters in Transformers](https://arxiv.org/abs/2010.11918)
- [Sparse Structure Search for Parameter-Efficient Tuning(Delta Tuning)](https://arxiv.org/abs/2206.07382)
:::
## STEP 3: Freeze parameters
So far the backbone model is still fully tunable. To freeze the main part of the backbone model except the trainable parts (usually the delta parameters), use the [freeze_module](opendelta.basemodel.DeltaBase.freeze_module) method. The syntax of the `exclude` field also obeys the [name-based addressing](namebasedaddr) rules.
```python
delta_model.freeze_module(exclude=["deltas", "layernorm_embedding"])
delta_model.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/afterfreeze.png
---
width: 600px
name: afterfreeze
---
```
````
Usually we want to save only the trainable part; then we should modify the `state_dict` of the backbone model, which originally contains all the parameters. With `set_state_dict=True`, `model.state_dict()` contains only the trainable parameters.
```python
delta_model.freeze_module(exclude=["deltas", "layernorm_embedding"], set_state_dict=True)
```
## STEP 4: Normal training pipeline
The **model** can then be trained with traditional training scripts. Two things should be noted:
:::{admonition} Note
:class: note
1. There is no need to change the optimizer, since the optimizer only computes and stores gradients for parameters with `requires_grad=True`, and the `requires_grad` attribute has already been changed during the call to the [freeze_module](opendelta.basemodel.DeltaBase.freeze_module) method.
2. `model.eval()` or `model.train()` should still be used to enable/disable dropout. OpenDelta doesn't touch those settings.
:::
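A minimal sketch of such an unchanged training step (not from the original page; the dataloader and the AdamW choice are assumptions):

```python
import torch
from torch.optim import AdamW

# The optimizer is set up exactly as in full-model tuning; frozen parameters
# simply receive no gradients and are skipped by the optimizer.
optimizer = AdamW(model.parameters(), lr=3e-4)

model.train()
for batch in dataloader:  # assumed to yield dicts of tokenized tensors including labels
    outputs = model(**batch)
    outputs.loss.backward()   # gradients flow only into requires_grad=True parameters
    optimizer.step()
    optimizer.zero_grad()
```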
## STEP 5: Save and load the Delta Model
### Option 1: Use the OpenDelta interface.
One option is to use our provided interface. This saves both the configuration of the delta model and all trainable parameters.
```python
delta_model.save_finetuned("some_local_path/")
```
When loading the delta model, just call the `from_finetuned` method. Note that the loaded model is fully trainable. If you want to continue training it, please use `freeze_module` again.
```python
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base")
from opendelta import AutoDeltaModel
delta_model = AutoDeltaModel.from_finetuned("some_local_path/", backbone_model=model)
```
### Option 2: Use the PyTorch interface.
Another option is to save and load the model in the traditional PyTorch way.
```python
import torch

torch.save(model.state_dict(), "some_local_path/pytorch_model.bin")
```
Then load it into an initialized backbone model with the delta model attached. Remember to use `strict=False`, since the `state_dict` now contains only the trainable parameters.
```python
import torch
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base")
from opendelta import AdapterModel
delta_model = AdapterModel(backbone_model=model, modified_modules=['fc2'], bottleneck_dim=12)
model.load_state_dict(torch.load("some_local_path/pytorch_model.bin"), strict=False)
```
### Option 3: Save and upload to DeltaCenter.
You can also save the delta model to DeltaCenter to share it with the community. See the [instructions](deltacenter).

@@ -0,0 +1,35 @@
# DeltaCenter
## Share to Delta Center.
```python
delta_model.save_finetuned("test_delta_model", push_to_dc = True)
```
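For context, a hedged sketch of where a `delta_model` like the one above might come from; the backbone, delta type, and checkpoint name are placeholders:

```python
from transformers import AutoModelForSeq2SeqLM
from opendelta import AdapterModel

t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
delta_model = AdapterModel(backbone_model=t5)  # ...then train it as usual...

# Save locally and push only the delta (not the backbone) to DeltaCenter:
delta_model.save_finetuned("test_delta_model", push_to_dc=True)
```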
## Download from Delta Center.
```python
# use transformers as usual.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
t5_tokenizer = AutoTokenizer.from_pretrained("t5-large")
# A running example
inputs_ids = t5_tokenizer.encode("Is Harry Poter wrtten by JKrowling", return_tensors="pt")
t5_tokenizer.decode(t5.generate(inputs_ids)[0])
# >>> '<pad><extra_id_0>? Is it Harry Potter?</s>'
```
Load delta model from delta center:
```python
# use existing delta models
from opendelta import AutoDeltaModel, AutoDeltaConfig
# use existing delta models from DeltaCenter
delta = AutoDeltaModel.from_finetuned("thunlp/Spelling_Correction_T5_LRAdapter_demo", backbone_model=t5)
# freeze the whole backbone model except the delta models.
delta.freeze_module()
# visualize the change
delta.log()
t5_tokenizer.decode(t5.generate(inputs_ids)[0])
# >>> <pad> Is Harry Potter written by JK Rowling?</s>
```

@@ -0,0 +1,16 @@
# Examples
## examples_prompt
| | LoRA | Bias<br>Tuning | Adapter<br>Houlsby | Adapter<br>Pfeiffer | Adapter<br>Drop | Adapter<br>Low-Rank | Compacter | Prefix<br>Tuning | Prompt<br>Tuning |
| --------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ----- | ----- |
| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |
| BART | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |
| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |
| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |
| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| T5-3b(parallel)| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Deberta-v2 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | |
| CTRL | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | |
## tutorials

@@ -0,0 +1,14 @@
# FAQs
1. **Why do I encounter NotImplementedError in Prefix Tuning?**
This is because we have found no easy way to get a unified Prefix Tuning implementation for different attention classes. If you really want to use Prefix Tuning for a model we have not supported, you can implement the ``PrefixLayerYOURMODEL`` on your own or raise an issue to request the feature for your model.
2. **Available Models with default configurations are ..., Please manually add the delta models by specifying 'modified_modules' based on the visualization of your model structure**
Although most pre-trained models (PTMs) use the transformers architecture, they are implemented differently. For example, the attention modules in GPT2 and BERT are not only named differently, but also implemented differently. Common structure mapping maps the different name conventions of different PTMs into a unified name convention. But there are many PTMs that we do not currently cover. Don't worry! For these models, you can figure out which modules you should modify by simply [visualizing the PTMs](visualization), and then specify the `modified_modules` manually (see [name-based addressing](namebasedaddr)).
3. **Requires a dummy_inputs to be passed through the model to understand the dimensionality of each tensor in the computation graph. The {module.__class__.__name__} Class has no dummy_inputs, and automatically created dummy_inputs failed.**
The `dummy_inputs` can be any data that makes `backbone_model.forward(**dummy_inputs)` succeed. Only the form and shape of the `dummy_inputs` matter. To set `dummy_inputs` for your model, use `setattr(backbone_model, 'dummy_inputs', some_dummy_inputs)` before initializing `{self.__class__.__name__}`. A sketch follows.
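A minimal sketch of setting `dummy_inputs` (the backbone choice and tensor values are hypothetical; only the form and shape matter):

```python
import torch
from transformers import AutoModel

backbone_model = AutoModel.from_pretrained("bert-base-uncased")

# Any input that makes backbone_model.forward(**dummy_inputs) succeed works.
some_dummy_inputs = {"input_ids": torch.tensor([[0, 0, 0]])}
setattr(backbone_model, "dummy_inputs", some_dummy_inputs)
# ...now initialize the delta model as usual.
```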

@@ -0,0 +1,129 @@
(visualization)=
# Visualize the Parameters
When OpenDelta makes modifications to a pretrained model (PTM), it is beneficial to know what your PTM looks like, especially the locations of the parameters.
- **Before** applying OpenDelta, you can learn **how to specify your modifications in terms of key addressing**.
- **After** the modification is done, you can check **whether your modification is what you expected**, for example, whether the positions of the delta modules are as desired, and whether you froze the correct parameters.
Now let's begin to try the visualization utility.
## Visualization is NOT easy using PyTorch's native functions.
```python
from transformers import BertForMaskedLM
backbone_model = BertForMaskedLM.from_pretrained("bert-base-uncased")
print(backbone_model)
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/raw_print.png
---
width: 600px
name: raw_print
---
```
````
The original presentation of models is **not tailored to repeated structures, big models, or parameter-centric tasks**.
## Using visualization from bigmodelvis.
First let's visualize all the parameters in the BERT model. As we can see, the structure inside a BERT model and the locations of all its parameters are neatly represented in a tree structure. (See the [color scheme](color_schema) for the colors.)
```python
from bigmodelvis import Visualization
model_vis = Visualization(backbone_model)
model_vis.structure_graph()
```
<!-- ````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span> -->
```{figure} ../imgs/bert_vis.png
---
width: 600px
name: bert_vis
---
```
<!-- ```` -->
<div class="admonition note">
<p class="title">**Suggestion**</p>
We can reference a module according to the graph easily:
```python
print(backbone_model.bert.encoder.layer[0].intermediate)
```
When using OpenDelta on a new backbone model, it's best to first visualize the child module names (shown in white), and then designate the `modified_modules`.
</div>
## Now add a delta model and visualize the change.
```python
from opendelta import LowRankAdapterModel
delta_model = LowRankAdapterModel(backbone_model)
delta_model.freeze_module(exclude=["cls", "intermediate", "LayerNorm"])
Visualization(backbone_model).structure_graph()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/bertdelta_vis.png
---
width: 600px
name: bertdelta_vis
---
```
````
(color_schema)=
<div class="admonition tip">
<div class="title">**Color Schema**</div>
<ul>
<li> The <span style="font-weight:bold;color:white;">white</span> part is the name of the module.</li>
<li> The <span style="font-weight:bold;color:green;">green</span> part is the module's type.</li>
<li> The <span style="font-weight:bold;color:blue;">blue</span> part is the tunable parameters, i.e., the parameters that require grad computation.</li>
<li> The <span style="font-weight:bold;color:grey;">grey</span> part is the frozen parameters, i.e., the parameters that do not require grad computation.</li>
<li> The <span style="font-weight:bold;color:red;">red</span> part is the structure that is repeated and thus folded.</li>
<li> The <span style="font-weight:bold;color:purple;">purple</span> part is the delta parameters inserted into the backbone model.</li>
</ul>
</div>
:::{admonition} Platform Sensitivity
:class: warning
Depending on the platform the code is running on, the colors may vary slightly.
:::
## We also provide the option to visualize the nodes without parameters.
```python
Visualization(backbone_model).structure_graph(keep_non_params=True)
```
Thus, modules like dropout and activations are kept.
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/bertdelta_noparam.png
---
width: 600px
name: bertdelta_noparam
---
```
````
:::{admonition} Order of the submodule
:class: warning
Currently, OpenDelta's visualization is based on PyTorch's `named_modules` method. That means the submodules are presented in the order they were added to the parent module, not necessarily the order in which tensors flow through them.
:::
# Inspect the optimizer

@@ -0,0 +1,31 @@
(installation)=
# Installation
The latest version of OpenDelta is tested on [Python 3.8](https://www.python.org/) and [PyTorch 1.12](https://pytorch.org/). Other versions are likely to be supported as well.
## Install the latest version
```bash
pip install git+https://github.com/thunlp/OpenDelta.git
```
## Install the latest pip version (more stable)
```bash
pip install opendelta
```
## Build from source
```bash
git clone git@github.com:thunlp/OpenDelta.git
cd OpenDelta
```
then
```
python setup.py install
```
or, if you want to modify the code for your research:
```
python setup.py develop
```

@@ -0,0 +1,68 @@
(keyfeature)=
# Philosophy and Key Features
:::{admonition} Plug-and-play Design.
:class: tip
Existing open-source projects that propagate this **''delta-tuning''** paradigm include
<a href="https://adapterhub.ml">AdapterHub</a>, which copies the transformers code base and modifies it, making it unintuitive to migrate from a normal code base to a delta-tuning one.
OpenDelta approaches this problem in a **true plug-and-play** fashion for the PLMs. To migrate from a full-model fine-tuning training script to a delta-tuning training script, you **DO NOT** need to change the backbone model code base to an adapted code base.
:::
Here is how we achieve it.
<img src="../imgs/pointing-right-finger.png" height="30px"> **Reading through this will also help you implement your own delta models in a sustainable way.**
## 1. Name-based submodule addressing.
See [name based addressing](namebasedaddr)
## 2. Three basic submodule-level delta operations.
We use three key functions to achieve the modifications to the backbone model outside the backbone model's code.
1. **unfreeze some parameters**

   Some delta models will unfreeze a part of the model parameters and freeze the other parts, e.g., [BitFit](https://arxiv.org/abs/2106.10199). For these methods, just use the [freeze_module](opendelta.basemodel.DeltaBase.freeze_module) method and pass the delta parts into `exclude` (see the sketch after this list).
2. **replace a module**

   Some delta models will replace a part of the model with a delta model, i.e., the hidden states will no longer go through the original submodules. This includes [Lora](https://arxiv.org/abs/2106.09685).
   For these methods, we have a [replace_module](opendelta.basemodel.DeltaBase.replace_module) interface.
3. **insertion to the backbone**
- **sequential insertion**
   Most adapter models insert a new adapter layer after/before the original transformer blocks. For these methods, insert the adapter's forward function after/before the original layer's forward function using the [insert_sequential_module](opendelta.basemodel.DeltaBase.insert_sequential_module) interface.
- **parallel insertion**
Adapters can also be used in a parallel fashion (see [Paper](https://arxiv.org/abs/2110.04366)).
For these methods, use [insert_parallel_module](opendelta.basemodel.DeltaBase.insert_parallel_module) interface.
:::{admonition} Doc-preserving Insertion
:class: note
In the insertion operations, the replaced forward function will inherit the doc strings of the original functions.
:::
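As a concrete illustration of operation 1, here is a minimal sketch (the backbone choice and the `exclude` list are illustrative):
```python
from transformers import AutoModelForSequenceClassification
from opendelta import BitFitModel

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased")
delta_model = BitFitModel(model)
# keep the bias deltas and the classification head tunable; freeze everything else
delta_model.freeze_module(exclude=["deltas", "classifier"])
```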
## 3. Pseudo input to initialize.
Some delta models, especially the ones that are newly introduced into the backbone, need to determine their parameters' shapes. To get the shapes, we pass a pseudo input to the backbone model and determine the shape of each delta layer according to the need of a smooth tensor flow.
:::{admonition} Pseudo Input
:class: warning
Most models in [Huggingface Transformers](https://huggingface.co/docs/transformers/index) have an attribute [dummy_inputs](https://github.com/huggingface/transformers/blob/v4.16.2/src/transformers/modeling_utils.py#L464). This creates a nonsensical input in the correct format to pass into the model's forward function.
For models that don't inherit/implement this attribute, we assume the pseudo input to the model is something like `input_id`, i.e., an integer tensor.
```python
pseudo_input = torch.tensor([[0,0,0]])
# or
pseudo_input = torch.tensor([0,0,0])
```
<img src="../imgs/todo-icon.jpeg" height="30px"> We will add interface to allow more pseudo input in the future.
:::

View File

@ -0,0 +1,185 @@
# Name-based Addressing
Name-based addressing is what sets OpenDelta apart from other packages and makes it applicable to a broader range of models (even emerging ones).
## Name of a submodule.
We locate the submodules that we want to apply a delta layer via name-based addressing.
In PyTorch fashion, a submodule can be accessed from a root model via 'dot' addressing. For example, we define a toy language model:
```python
import torch.nn as nn
class MyNet1(nn.Module):
def __init__(self,):
super().__init__()
self.name_a = nn.Linear(5,5)
def forward(self, hiddens):
return self.name_a(hiddens)
class MyNet2(nn.Module):
def __init__(self,):
super().__init__()
self.embedding = nn.Embedding(10,5)
self.name_b = nn.Sequential(MyNet1(), MyNet1())
def forward(self, input_ids):
hiddens = self.embedding(input_ids)
return self.name_b(hiddens)
root = MyNet2()
print(root.name_b[0].name_a)
# Linear(in_features=5, out_features=5, bias=True)
```
We can visualize the model (For details, see [visualization](visualization))
```python
from bigmodelvis import Visualization
Visualization(root).structure_graph()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/name_based_addressing.png
---
width: 500px
name: name_based_addressing
---
```
````
In this case, string `"name_b.0.name_a"` will be the name to address the submodule from the root model.
Thus, when applying a delta model to this toy net:
```python
from opendelta import AdapterModel
AdapterModel(backbone_model=root, modified_modules=['name_b.0.name_a'])
Visualization(root).structure_graph()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/toy-delta.png
---
width: 500px
name: toy-delta
---
```
````
(targetmodules)=
## Target modules.
For different delta methods, the operation applied to the modification target differs.
- Adapter based method: Insert at the target module's forward function.
- BitFit: Add bias to all allowed positions of the target module.
- Lora: Substitute all the linear layers of the target module with [Lora.Linear](https://github.com/microsoft/LoRA/blob/main/loralib/layers.py#L92).
- Prefix Tuning: the target module must be an attention module.
:::{admonition} Auto Searching
:class: note
We are working on unified operations that automatically search within a given module for the submodules to which a specific delta method can be applied.
:::
## Making addressing easier.
Handcrafting the full names of submodules can be frustrating, so we made some simplifications:
1. **End-matching** Rules.
OpenDelta will take every module that
**ends with** the provided name suffix as a modification [target module](targetmodules).
:::{admonition} Example
:class: tip
Taking DistilBERT with a classifier on top as an example:
- setting `["0.attention.out_lin"]` will add delta modules to the attention output of DistilBERT's
layer 0, i.e., `distilbert.transformer.layer.0.attention.out_lin`.
- setting `["attention.out_lin"]` will add the delta modules in every layer's `attention.out_lin`.
:::
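In code, the second rule above would look like this minimal sketch (the backbone choice is illustrative):
```python
from transformers import AutoModel
from opendelta import AdapterModel

model = AutoModel.from_pretrained("distilbert-base-uncased")
# every module whose name ends with "attention.out_lin" becomes a target
delta_model = AdapterModel(backbone_model=model, modified_modules=["attention.out_lin"])
```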
(regexexpr)=
2. Regular Expression.
We also support regex end-matching rules.
We use a beginning `[r]` followed by a regular expression to represent this rule, where `[r]` is used to distinguish it from normal string matching rules and has no other meanings.
Taking RoBERTa with a classifier on top as an example: it has two modules named `roberta.encoder.layer.0.attention.output.dense` and `roberta.encoder.layer.0.output.dense`, which both end with `output.dense`. To distinguish them:
- set `'[r](\d)+\.output.dense'` using the regex rule, where `(\d)+` matches any layer number. This rule matches all `roberta.encoder.layer.$.output.dense`, where `$` stands for any integer; in a 12-layer RoBERTa, that's 0-11.
- set `'[r][0-5]\.attention'` to match only layers 0-5's attention submodules.
- set `'attention.output.dense'` using the ordinary rule, which matches `roberta.encoder.layer.$.attention.output.dense` but not `roberta.encoder.layer.$.output.dense`.
:::{admonition} Regex in Json Configs
:class: warning
In json, you should write `"\\."` instead of `"\."` for a real dot due to json parsing rules. That is
```
{
...
"modified_moduls": ['[r][0-5]\\.attention'],
...
}
```
:::
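As a hedged sketch of the first regex rule in Python (the backbone and delta method are illustrative):
```python
from transformers import AutoModelForSequenceClassification
from opendelta import LoraModel

model = AutoModelForSequenceClassification.from_pretrained("roberta-base")
# '[r]' marks a regex rule; this matches every layer's output.dense
delta_model = LoraModel(backbone_model=model,
                        modified_modules=[r'[r](\d)+\.output.dense'])
```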
3. Interactive Selection.
We provide a visual, interactive way to select the needed modules.
```python
from transformers import BertForMaskedLM
model = BertForMaskedLM.from_pretrained("bert-base-cased")
# suppose we load BERT
from opendelta import LoraModel # use lora as an example, others are same
delta_model = LoraModel(backbone_model=model, interactive_modify=True)
```
By setting `interactive_modify`, a web server will be opened on localhost, and the link will be printed in the terminal, e.g.,
```
http://0.0.0.0:8888/
```
If on your local machine, click the link to open it for interactive modification.
If on a remote host, you can use port mapping. For example, the VS Code terminal automatically does port mapping for you; simply use `control/command + click` to open the link.
If the default port is occupied by another program, you can change it by setting `interactive_modify=port_number`, where `port_number` is an integer.
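For example, a minimal sketch with a custom port (the backbone and port number are illustrative):
```python
from transformers import BertForMaskedLM
from opendelta import LoraModel

model = BertForMaskedLM.from_pretrained("bert-base-cased")
# serve the interactive selection page on port 8899 instead of the default
delta_model = LoraModel(backbone_model=model, interactive_modify=8899)
```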
The web page looks like the following figure.
```{figure} ../imgs/interact.jpg
---
width: 500px
name: interact web page
---
```
- Click `[+]`/`[-]` to expand/collapse tree nodes.
- Click on text to select tree nodes; a **yellow dotted** box indicates the selection.
- **Double-clicking** the pink `[*]` is an advanced option to unfold repeated nodes. By default, modules with the same architecture are folded into one node and marked in red; for example, the `BertLayer` of layers 0~11 in the above figure share the same structure. Regular modifications will make the same change to each layer.
- If you want to change only a few of them, first double-click on `[*]`, then select the parts you want in the unfolded structure.
- If you want to make the same change to all but a few of them, first select the common parts you want in the folded structure, then double-click on `[*]` to remove the few positions you don't need to change in the expanded structure.
Click the `submit` button in the top-right corner, then go back to your terminal. You will get a list of name-based addresses printed in the following format; these are the modules to which the deltas are applied.
```
modified_modules:
[bert.encoder.layer.0.output.dense, ..., bert.encoder.layer.11.output.dense]
```
## Examples
Nothing works better than a few lively examples.
Coming Soon...

View File

@ -0,0 +1,36 @@
# What is Delta-tuning and Why OpenDelta?
(WhatisDelta)=
:::{admonition} What is Delta?
:class: tip
As Pre-trained language models (PLMs) have become the fundamental infrastructure on many NLP tasks and benchmarks, it is becoming increasingly clear from recent research that **larger models tend to lead to better performance**. However, large-scale PLMs also bring prohibitive adaptation costs when fine-tuning all the parameters of a model and retaining separate instances for different tasks.
**Parameter-efficient model stimulation methods** have thus attracted researchers' attention: they tune only a small fraction of the model parameters while achieving comparable or even better performance than full-model fine-tuning, and are dubbed "Delta-tuning".
**Delta** thus means a small fraction $\Delta\Theta$ of parameters besides the pretrained model $\Theta_0$.
\begin{gather*}
\Theta \sim \Theta_0\text{(frozen)} + \Delta\Theta\text{(tunable)}
\end{gather*}
This open-source project implements several delta-tuning methods, which allows researchers and engineers to quickly migrate their code from full-model tuning to delta-tuning without replacing the backend (the implementation of the backbone PLM).
:::
## Why OpenDelta?
- <span style="color:rgb(81, 217, 245);font-weight:bold">Clean:</span> No need to edit the backbone PTMs codes.
- <span style="color:orange;font-weight:bold">Simple:</span> Migrating from full-model tuning to delta-tuning needs as little as 3 lines of codes.
- <span style="color:green;font-weight:bold">Sustainable:</span> Most evolution in external library doesnt require a new OpenDelta.
- <span style="color:red;font-weight:bold">Extendable:</span> Various PTMs can share the same delta-tuning codes.
- <span style="color:purple;font-weight:bold">Flexible:</span> Able to apply delta-tuning to (almost) any position of the PTMs.
## Delta-tuning papers
<img src="../imgs/todo-icon.jpeg" height="30px">

View File

@ -0,0 +1,113 @@
# Multitask Modeling using OpenDelta
:::{admonition} Multitask Serving with Delta-tuning
:class: tip
A huge advantage of delta-tuning is that it can be used for multitask serving.
Imagine we have a pretrained model trained on a mix of data from multiple languages, e.g., English, Chinese, and French. Now you want separate models that specialize in Chinese, French, and English. We can delta-tune three deltas, one per language, each with a small amount of additional language-specific data. During serving, when a Chinese sentence comes, you attach the "Chinese Delta"; when a French sentence comes next, you detach the "Chinese Delta" and attach the "French Delta".
:::
**Here is how to achieve multitask serving using OpenDelta.**
```python
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base")
from opendelta import LoraModel
delta_model = LoraModel(backbone_model=model, modified_modules=['fc2'])
delta_model.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/plugunplug1.png
---
width: 800px
name: plugunplug1
---
```
````
Now we detach the deltas from the backbone
```python
delta_model.detach()
delta_model.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/plugunplug2.png
---
width: 800px
name: plugunplug2
---
```
````
We can reattach the deltas to the backbone
```python
delta_model.attach()
delta_model.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/plugunplug3.png
---
width: 800px
name: plugunplug3
---
```
````
:::{admonition} Independence of Different Delta Models
:class: note
Different delta models will be independent in detaching and attaching.
(But the visualization will not show all deltas in the backbone model.)
```python
# continue from the above example
from opendelta import AdapterModel
delta_model2 = AdapterModel(backbone_model=model, modified_modules=['fc1'])
delta_model2.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/plugunplug4.png
---
width: 800px
name: plugunplug4
---
```
````
Detach the LoRA delta:
```python
delta_model.detach() # detach the lora delta
delta_model.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/plugunplug5.png
---
width: 800px
name: plugunplug5
---
```
````
Detach the adapter delta and reattach the LoRA delta:
```python
delta_model2.detach() # detach the adapter delta
delta_model.attach() # reattach the lora delta
delta_model.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/plugunplug6.png
---
width: 800px
name: plugunplug6
---
```
````
:::
:::{admonition} BitFit not supported
:class: warning
<img src="../imgs/todo-icon.jpeg" height="30px"> Currently detach is not suitable for BitFit, which modify the requires_grad property. Please wait for future releases.
:::

View File

@ -0,0 +1,38 @@
(basics)=
# Quick Start
Now we introduce the most basic interface to migrate your full-model tuning scripts to delta-tuning ones **on some commonly used PTMs or their derivative models** (models that have the PTM as a submodule, e.g., `BertForSequenceClassification`). [Try it in Colab](https://colab.research.google.com/drive/1SB6W5B-2nKxOnkwHSIe3oGXZ7m53u_Vf?usp=sharing)
```diff
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("bert-large-cased")
+ from opendelta import AdapterModel
+ delta_model = AdapterModel(model)
+ delta_model.freeze_module(exclude=["deltas", "classifier"]) # leave the delta tuning modules and the newly initialized classification head tunable.
+ # delta_model.log() # optional: to visualize how the `model` changes.
training_dataloader = get_dataloader()
optimizer, loss_function = get_optimizer_loss_function()
for batch in training_dataloader:
optimizer.zero_grad()
targets = batch.pop('labels')
outputs = model(**batch).logits
loss = loss_function(outputs, targets)
loss.backward()
optimizer.step()
print(loss)
- torch.save(model.state_dict(), "finetuned_bert.ckpt")
+ delta_model.save_finetuned("finetuned_bert")
```
We currently support the following models and their derivative models in their default configurations.
- BERT
- DeBERTa-v2
- GPT2
- OPT
- RoBERTa
- T5
For models not in the above list, please refer to the more detailed [custom usage](custom).
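To later load the delta saved above onto a fresh backbone, a hedged sketch (assuming `AutoDeltaModel.from_finetuned`; the exact API may vary across versions):
```python
from transformers import AutoModelForSequenceClassification
from opendelta import AutoDeltaModel

backbone = AutoModelForSequenceClassification.from_pretrained("bert-large-cased")
# re-attach the delta saved by delta_model.save_finetuned("finetuned_bert")
delta_model = AutoDeltaModel.from_finetuned("finetuned_bert", backbone_model=backbone)
```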

View File

@ -0,0 +1,82 @@
(commonstructure)=
# Common Structure Mapping
```{figure} ../imgs/transformers_structure.png
:width: 400px
:name: transformers_structure
```
Although different PTMs often share similar Transformers structures, the codebases, and most importantly, the variable names for each submodule, are quite different.
On the one hand, we **encourage the users to first [visualize](visualization) the PTM's structure and then determine the names of the submodules.**
On the other hand, we designed a unified naming convention for the Transformer structure and provide several structure mappings from the original names to the unified convention.
In this section, we will illustrate the unified name convention and structure mapping.
## Common blocks in Transformers structure.
- embeddings (word embedding)
- encoder
- block
- $ (layer_id)
- attn
- q, k, v
- proj
- layer_norm
- ff
- w1
- w2
- layer_norm
- decoder (similar to encoder)
- lm_head
- proj
Visualizing bert-base using the common structure names: the submodules that are not common are grey.
```{figure} ../imgs/commonstructure_vis.png
:width: 600px
:name: commonstructure_vis
```
(mappingexample)=
## Example
Example of bert mapping: a tree with node names specified by <span style="font-weight:bold;color:rgb(55, 125, 34);" >"\_\_name\_\_"</span>
```json
{
"bert.embeddings.word_embeddings": {"__name__":"embeddings"},
"bert.embeddings.position_embeddings": {"__name__":""},
"bert.embeddings.token_type_embeddings": {"__name__":""},
"bert.embeddings.LayerNorm": {"__name__":""},
"bert.encoder": {"__name__":"encoder",
"layer": {"__name__":"block",
"$": {"__name__":"$",
"attention": {"__name__":"attn",
"self.query": {"__name__":"q"},
"self.key": {"__name__":"k"},
"self.value": {"__name__":"v"},
"output.dense": {"__name__":"proj"},
"output.LayerNorm": {"__name__":"layer_norm"},
},
"output": {"__name__":"ff",
"dense": {"__name__":"w2"},
"LayerNorm": {"__name__":"layer_norm"}
},
"intermediate.dense": {"__name__":"ff.w1"},
}
}
},
"cls.predictions": {"__name__": "lm_head",
"transform.dense": {"__name__":""},
"transform.LayerNorm": {"__name__":""},
"decoder": {"__name__":"proj"},
}
}
```
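With such a mapping, unified names can be used as `modified_modules`. A hedged sketch (assuming the `common_structure` flag; check your installed version's API):
```python
from transformers import BertForMaskedLM
from opendelta import LoraModel

model = BertForMaskedLM.from_pretrained("bert-base-cased")
# with the mapping above, 'attn.q' resolves to bert.encoder.layer.$.attention.self.query
delta_model = LoraModel(backbone_model=model, modified_modules=['attn.q'],
                        common_structure=True)
```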

View File

@ -0,0 +1,35 @@
# Update Logs and Known Issues
## Version 0.3.2
- We improve the docs.
- We support BMTrain to accelerate the training, and parallelize the training of models that are hard to fit in a single GPU. Check [tutorial/2_with_bmtrain.py](https://github.com/thunlp/OpenDelta/tree/main/examples/tutorial/2_with_bmtrain.py)
- We add a functionality to [inspect the optimizer](https://github.com/thunlp/OpenDelta/tree/main/opendelta/utils/inspect.py). The user can see the number of trainable parameters in the optimizer and verify that opendelta is being used correctly.
- We move the functions that inspect the delta models into [inspect.py](https://github.com/thunlp/OpenDelta/tree/main/opendelta/utils/inspect.py).
## Version 0.3.1
- We update [must_try.py](https://github.com/thunlp/OpenDelta/tree/main/examples/unittest/must_try.py) for a simple introduction of the core functionality of OpenDelta.
- Thanks to [Weilin Zhao](https://github.com/Achazwl), we merge a long-developed branch `parallel_adapter` into the main branch.
## Version 0.3.0
### Updates:
- Add this changelog for a granular record of updates.
- The default configuration of delta models can be applied to more wrapped models.
- There is less need to configure 'modified_modules' for wrapped models like [BertForSequenceClassification](https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification) or even [OpenMatch.DRModel](https://github.com/OpenMatch/OpenMatch/blob/master/src/openmatch/modeling/dense_retrieval_model.py#L37), as long as they contain a model for which we support a default configuration. **Note that if you customize `modified_modules` by yourself, most pytorch models are supported.**
- LoRA and BitFit models now do not need pseudo data to instantiate the model.
- BitFit models can now support [Conv1D](https://huggingface.co/docs/transformers/v4.23.1/en/internal/modeling_utils#transformers.Conv1D) using default configuration.
- Improve type hint for AutoDeltaModel.
- Fix bugs in documentation.
- Fix small bugs when saving a model without a config attributes.
- Make the default modified modules of adapter-like methods more accurate: attach the adapter-like modules after the output of the attention layer and the second feed-forward layer, both before the layernorm layers.
- A simple unit test folder containing development-time tests has been added for interested users.
### Known Issues
- SoftPrompt is still not supported for wrapped models if the model has no attribute `get_input_embeddings`.
- Prefix Tuning is still limited to T5, GPT2, Bart, Bert, Roberta.
## Version 0.2.4
### Updates
- examples/examples_seq2seq and examples/examples_text-classification are deprecated and moved to [legacy](https://github.com/thunlp/OpenDelta/tree/main/examples/legacies).
- Thanks to [Zhen Zhang](https://github.com/namezhenzhang), we provide [examples_prompt](https://github.com/thunlp/OpenDelta/tree/main/examples/examples_prompt), as a cleaner and more general framework, which unifies the delta tuning paradigm and the prompt-tuning paradigm. It is still based on [Huggingface Trainers](https://huggingface.co/docs/transformers/main_classes/trainer). In this example framework, the running pipeline is [a unified script](https://github.com/thunlp/OpenDelta/tree/main/examples/examples_prompt/src), the differences in tasks, models, delta tuning models, and even prompt-tuning paradigms are [more modular and be more independent ](https://github.com/thunlp/OpenDelta/tree/main/examples/examples_prompt/backbones). Please try it out!

View File

@ -0,0 +1,3 @@
# OpenDelta + Huggingface Accelerate
<img src="../imgs/todo-icon.jpeg" height="30px">

View File

@ -0,0 +1,12 @@
(acceleration)=
# OpenDelta + BMTrain
- [BMTrain](https://github.com/OpenBMB/BMTrain) is an efficient large model training toolkit that can be used to train large models with tens of billions of parameters. It can train models in a distributed manner while keeping the code as simple as stand-alone training.
- [ModelCenter](https://github.com/OpenBMB/ModelCenter) implements pre-trained language models (PLMs) based on the backend OpenBMB/BMTrain. ModelCenter supports Efficient, Low-Resource, Extendable model usage and distributed training.
Now LoraModel, AdapterModel, CompacterModel, ParallelAdapterModel, and LowRankAdapterModel fully support distributed training with BMTrain and ModelCenter.
Pass `backend='bmt'` in the config or at delta model initialization to enable BMTrain.
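A minimal sketch of the second option (here `model` is assumed to be a ModelCenter backbone built elsewhere):
```python
from opendelta import LoraModel

# `model` is a ModelCenter PLM built elsewhere (see the ModelCenter docs)
delta_model = LoraModel(backbone_model=model, backend='bmt')  # enable the BMTrain backend
```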

View File

@ -0,0 +1,25 @@
# Use Examples
This repo mainly contains several running scripts to use OpenDelta to conduct parameter-efficient training of various tasks.
**Note that we suggest adding OpenDelta to existing scripts, instead of modifying a script to match the following examples. OpenDelta itself doesn't restrict the training pipeline, nor does it provide one.**
## tutorial
Several toy tutorials:
1. The scripts for docs/basic_usage
2. Using interactive module selection
3. Work with [OpenPrompt](https://github.com/thunlp/OpenPrompt)
## examples_text-classification
Modify a huggingface text-classification examples into a delta tuning one.
Currently, GLUE datasets are supported in the scripts. Roberta-base is used for performance checking. Read README.md inside the repo for detailed usage.
## examples_seq2seq
Modify a huggingface sequence to sequence examples into a delta tuning one.
Currently, SuperGLUE and GLUE datasets are supported in the scripts. T5-base is used for performance checking. Read README.md inside the repo for detailed usage.
## examples_image-classification
A toy example of using OpenDelta for a Computer Vision pretrained model (ViT). Since ViT is an experimental feature in huggingface transformers, this example is subject to change at any moment.

View File

@ -0,0 +1,59 @@
# Examples of using opendelta together with 🤗 transformers.
In this repo, we construct a very general pipeline to train and test a PLM using
🤗 transformers.
The pipeline was constructed together with [openpromptu](https://pypi.org/project/openpromptu/), which is a light and
model-agnostic version of [openprompt](https://github.com/thunlp/OpenPrompt).
## Pool of PLMs
We are going to adapt most of the models in 🤗 transformers
in this repo. The different pipelines, processing steps, and configurations are specified
in `./backbones/`. You can add your own model there to support customized models.
### An example script to run the repo in offline mode
```bash
conda activate [YOURENV]
PATHBASE=[YOURPATH]
JOBNAME="adapter_t5-base"
DATASET="superglue-cb"
cd $PATHBASE/OpenDelta/examples/examples_prompt/
python configs/gen_t5.py --job $JOBNAME
export TRANSFORMERS_OFFLINE=1
export HF_DATASETS_OFFLINE=1
python src/run.py configs/$JOBNAME/$DATASET.json \
--model_name_or_path [YOURPATH_TO_T5_BASE] \
--tokenizer_name [YOURPATH_TO_T5_BASE] \
--datasets_saved_path [YOURPATH_TO_CB_DATASETS] \
--finetuned_delta_path ${PATHBASE}/delta_checkpoints/ \
--num_train_epochs 20 \
--bottleneck_dim 24 \
--delay_push True
```
## An example of quickly testing the repo.
```bash
conda activate [YOURENV]
PATHBASE=[YOURPATH]
JOBNAME="adapter_t5-base"
DATASET="superglue-cb"
cd $PATHBASE/OpenDelta/examples/examples_prompt/
export TRANSFORMERS_OFFLINE=1
export HF_DATASETS_OFFLINE=1
export DELTACENTER_OFFLINE=0
python src/test.py configs/$JOBNAME/$DATASET.json \
--model_name_or_path [YOURPATH_TO_T5_BASE] \
--tokenizer_name [YOURPATH_TO_T5_BASE] \
--datasets_saved_path [YOURPATH_TO_CB_DATASETS] \
--finetuned_delta_path thunlp/t5-base_adapter_superglue-cb_20220701171436c80 \
--delta_cache_dir "./delta_checkpoints/" \
--force_download True
```

View File

@ -0,0 +1,179 @@
from openpromptu.data_utils import InputExample
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
from transformers import (
AutoConfig,
AutoModelForSeq2SeqLM,
AutoTokenizer,
)
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
import torch
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.mask_token
def get_remove_columns(dataset_features):
return dataset_features
def preprocess_function(raw_example, **kwargs):
# max_target_length += 1
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
split = kwargs['split']
example = InputExample(**raw_example)
example = verbalizer.wrap_one_example(example)
example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
model_inputs = tokenizer(input_sentence, max_length=256,
padding="max_length", truncation=True)
with tokenizer.as_target_tokenizer():
label = tokenizer(other['tgt_text']).input_ids
model_inputs["labels"] = label
return model_inputs
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
# model_args.config_name if model_args.config_name else model_args.model_name_or_path,
model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
return config, tokenizer, model
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
from openpromptu.prompts import GenerationVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return template, verbalizer, tokenizer_wrapper
class Trainer(HfSeq2SeqTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.eval_task = eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
outputs = model(**inputs)
if return_outputs:
return (outputs.loss, outputs)
else:
return outputs.loss
def prediction_step(
self,
model, #nn.Module,
inputs, #Dict[str, Union[torch.Tensor, Any]],
prediction_loss_only, #: bool,
ignore_keys, #: Optional[List[str]] = None,
): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
"""
Perform an evaluation step on :obj:`model` using :obj:`inputs`.
Subclass and override to inject custom behavior.
Args:
model (:obj:`nn.Module`):
The model to evaluate.
inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
The inputs and targets of the model.
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
argument :obj:`labels`. Check your model's documentation for all accepted arguments.
prediction_loss_only (:obj:`bool`):
Whether or not to return the loss only.
Return:
Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
labels (each being optional).
"""
if not self.args.predict_with_generate or prediction_loss_only:
return super().prediction_step(
model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
)
has_labels = "labels" in inputs
inputs = self._prepare_inputs(inputs)
gen_kwargs = {
"max_length": 10, # self._max_length if s is not None else self.model.config.max_length,
"num_beams": 1 #self._num_beams if self._num_beams is not None else self.model.config.num_beams,
}
generated_tokens = self.model.generate(
inputs["input_ids"],
attention_mask=inputs["attention_mask"],
**gen_kwargs,
)
# in case the batch is shorter than max length, the output should be padded
if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
with torch.no_grad():
outputs = model(**inputs)
if has_labels:
if self.label_smoother is not None:
loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
else:
loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
else:
loss = None
if self.args.prediction_loss_only:
return (loss, None, None)
labels = inputs["labels"]
if labels.shape[-1] < gen_kwargs["max_length"]:
labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
# from IPython import embed; embed(header="In seqseqtrainer")
return (loss, generated_tokens, labels)
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds
decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
# post_processor = .get(data_args.dataset_name[0], tokenizer,
# data_args.ignore_pad_token_for_loss)
# decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info)
result = {}
for metric in self.eval_task.metric:
result.update(metric(decoded_preds, decoded_labels))
average_metric = sum(result.values())/len(result)
result.update({"average_metrics":average_metric})
return result

View File

@ -0,0 +1,140 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
import numpy as np
from transformers import (
AutoConfig,
AutoFeatureExtractor,
AutoModelForImageClassification,
)
from transformers import Trainer as HfTrainer
import torch.nn as nn
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
# from openpromptu.prompts import ManualVerbalizer
# from openpromptu.prompts import ManualTemplate
# from openpromptu import TokenizerWrapper
# template = ManualTemplate(text = task.templates_text[template_id])
# verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
# tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return None, None, None
def preprocess_function(raw_example, **kwargs):
# from IPython import embed; embed(header="Therefa")
tokenizer = kwargs['tokenizer']
# print(np.array(raw_example['img']).shape)
model_inputs = tokenizer(np.array(raw_example['image']), return_tensors='pt')
model_inputs['pixel_values'] = model_inputs['pixel_values'].squeeze()
model_inputs['labels'] = raw_example['label']
return model_inputs
def compute_metrics(eval_preds, dataset_name, eval_metric):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in eval_metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.mask_token
def get_remove_columns(dataset_features):
# dataset_features.pop("label")
# print("remove_columns: {}".format(dataset_features))
return dataset_features
class DataCollator(HfDataCollatorMixin):
def __init__(self, *args, **kwargs):
self.return_tensors='pt'
def torch_call(self, features):
# from IPython import embed; embed(header="in data collator")
a = torch_default_data_collator(features=features)
# from IPython import embed; embed(header="in data collator")
return a
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoFeatureExtractor.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForImageClassification.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.num_labels = model_args.num_classes
old_classifier = model.classifier
model.classifier = nn.Linear(old_classifier.in_features, config.num_labels)
return config, tokenizer, model
class Trainer(HfTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.verbalizer=verbalizer
self.eval_task=eval_task
self.compute_metrics = self._compute_metrics
self.loss_fn = nn.CrossEntropyLoss()
def compute_loss(self, model, inputs, return_outputs=False):
labels = inputs.pop('labels')
outputs = model(**inputs)
logits = outputs.get("logits")
loss = self.loss_fn(logits, labels)
return (loss, outputs) if return_outputs else loss
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in self.eval_task.metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
# from IPython import embed; embed(header="In compute metrics")  # debug hook disabled
return result

View File

@ -0,0 +1,142 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
import numpy as np
from transformers import (
AutoConfig,
AutoModelForMaskedLM,
AutoTokenizer,
)
from transformers import Trainer as HfTrainer
def preprocess_function(raw_example, **kwargs):
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
example = InputExample(**raw_example)
example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
padding="max_length", truncation=True)
return model_inputs
def compute_metrics(eval_preds, dataset_name, eval_metric):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in eval_metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.mask_token
def get_remove_columns(dataset_features):
dataset_features.remove("label")
return dataset_features
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
from openpromptu.prompts import ManualVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
# from IPython import embed; embed()
return template, verbalizer, tokenizer_wrapper
class DataCollator(HfDataCollatorMixin):
def __init__(self, *args, **kwargs):
self.return_tensors='pt'
def torch_call(self, features):
return torch_default_data_collator(features=features)
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForMaskedLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model.resize_token_embeddings(len(tokenizer))
return config, tokenizer, model
class Trainer(HfTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.verbalizer=verbalizer
self.eval_task=eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
labels = inputs.pop('labels')
outputs = model(**inputs)
logits = outputs.get("logits")
input_ids = inputs['input_ids']
verbalizer = self.verbalizer.cuda()
logits_at_mask = logits[torch.where(input_ids == verbalizer.tokenizer.mask_token_id)]
label_logits = verbalizer.process_logits(logits_at_mask)
loss_fct = torch.nn.CrossEntropyLoss()
loss = loss_fct(label_logits, labels)
outputs.logits = label_logits
return (loss, outputs) if return_outputs else loss
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in self.eval_task.metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result

View File

@ -0,0 +1,143 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
import numpy as np
from transformers import (
AutoConfig,
AutoModelForMaskedLM,
AutoTokenizer,
)
from transformers import Trainer as HfTrainer
def preprocess_function(raw_example, **kwargs):
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
example = InputExample(**raw_example)
example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
padding="max_length", truncation=True)
return model_inputs
def compute_metrics(eval_preds, dataset_name, eval_metric):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in eval_metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.mask_token
def get_remove_columns(dataset_features):
# from IPython import embed; embed(header="get_remove_columns")
dataset_features.remove("label")
return dataset_features
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
from openpromptu.prompts import ManualVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
# from IPython import embed; embed()
return template, verbalizer, tokenizer_wrapper
class DataCollator(HfDataCollatorMixin):
def __init__(self, *args, **kwargs):
self.return_tensors='pt'
def torch_call(self, features):
return torch_default_data_collator(features=features)
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForMaskedLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model.resize_token_embeddings(len(tokenizer))
return config, tokenizer, model
class Trainer(HfTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.verbalizer=verbalizer
self.eval_task=eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
labels = inputs.pop('labels')
outputs = model(**inputs)
logits = outputs.get("logits")
input_ids = inputs['input_ids']
verbalizer = self.verbalizer.cuda()
logits_at_mask = logits[torch.where(input_ids == verbalizer.tokenizer.mask_token_id)]
label_logits = verbalizer.process_logits(logits_at_mask)
loss_fct = torch.nn.CrossEntropyLoss()
loss = loss_fct(label_logits, labels)
outputs.logits = label_logits
return (loss, outputs) if return_outputs else loss
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in self.eval_task.metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result

View File

@ -0,0 +1,169 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
import numpy as np
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoTokenizer,
)
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
import copy
from torch.nn import CrossEntropyLoss
def preprocess_function(raw_example, **kwargs):
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
example = InputExample(**raw_example)
# example = verbalizer.wrap_one_example(example)
example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
padding="max_length", truncation=True)
return model_inputs
def compute_metrics(eval_preds, dataset_name, eval_metric):
pass
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.pad_token
def get_remove_columns(dataset_features):
# dataset_features.remove("label")
return dataset_features
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
from openpromptu.prompts import GenerationVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = None, label_words=None)
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return template, verbalizer, tokenizer_wrapper
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForCausalLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
return config, tokenizer, model
class Trainer(HfSeq2SeqTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.eval_task = eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
labels=copy.deepcopy(inputs['input_ids'])
# labels[labels==self.tokenizer.pad_token_id]=-100
outputs = model(**inputs)
logits = outputs.logits
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.long().view(-1))
return (loss, outputs) if return_outputs else loss
def prediction_step(
self,
model, #nn.Module,
inputs, #Dict[str, Union[torch.Tensor, Any]],
prediction_loss_only, #: bool,
ignore_keys, #: Optional[List[str]] = None,
): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
"""
Perform an evaluation step on :obj:`model` using :obj:`inputs`.
Subclass and override to inject custom behavior.
Args:
model (:obj:`nn.Module`):
The model to evaluate.
inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
The inputs and targets of the model.
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
argument :obj:`labels`. Check your model's documentation for all accepted arguments.
prediction_loss_only (:obj:`bool`):
Whether or not to return the loss only.
Return:
Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
labels (each being optional).
"""
if not self.args.predict_with_generate or prediction_loss_only:
return super().prediction_step(
model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
)
inputs = self._prepare_inputs(inputs)
with torch.no_grad():
labels=copy.deepcopy(inputs['input_ids'])
# labels[labels==self.tokenizer.pad_token_id]=-100
outputs = model(**inputs)
logits = outputs.logits
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous().long()
loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1)).detach().cpu()
loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss)
if prediction_loss_only:
return (loss, None, None)
else:
# non pad label
shift_labels = shift_labels.view(-1).detach().cpu()
nonpad_idx = shift_labels!=self.tokenizer.pad_token_id
shift_labels = shift_labels[nonpad_idx]
# the probability at the corresponding position
shift_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu()
target_position = torch.nn.functional.one_hot(shift_labels,shift_logits.shape[-1]).bool().to(shift_labels.device)
shift_logits = shift_logits.softmax(dim=-1)[target_position]
return (loss, shift_logits, shift_labels)
def _compute_metrics(self, eval_preds):
preds, labels = eval_preds
result = {}
for metric in self.eval_task.metric:
result.update(metric(preds, labels,ignore_index=self.tokenizer.pad_token_id))
average_metric = sum(result.values())/len(result)
result.update({"average_metrics":average_metric})
return result

View File

@ -0,0 +1,181 @@
from openpromptu.data_utils import InputExample
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
from transformers import (
AutoConfig,
BlenderbotForConditionalGeneration,
AutoTokenizer,
)
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
import torch
def mask_token_func(tokenizer, ith_mask=0):
return ""
def get_remove_columns(dataset_features):
return dataset_features
def preprocess_function(raw_example, **kwargs):
# max_target_length += 1
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
split = kwargs['split']
example = InputExample(**raw_example)
example = verbalizer.wrap_one_example(example)
example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
padding="max_length", truncation=True)
with tokenizer.as_target_tokenizer():
label = tokenizer(other['tgt_text']).input_ids
model_inputs["labels"] = label
# from IPython import embed; embed()
return model_inputs
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = BlenderbotForConditionalGeneration.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# from IPython import embed; embed()
return config, tokenizer, model
def get_prompts(task, tokenizer, data_args, template_id="blenderbot", verbalizer_id="blenderbot"):
from openpromptu.prompts import GenerationVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return template, verbalizer, tokenizer_wrapper
class Trainer(HfSeq2SeqTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.eval_task = eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
# from IPython import embed; embed()
outputs = model(**inputs)
if return_outputs:
return (outputs.loss, outputs)
else:
return outputs.loss
def prediction_step(
self,
model, #nn.Module,
inputs, #Dict[str, Union[torch.Tensor, Any]],
prediction_loss_only, #: bool,
ignore_keys, #: Optional[List[str]] = None,
): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
"""
Perform an evaluation step on :obj:`model` using :obj:`inputs`.
Subclass and override to inject custom behavior.
Args:
model (:obj:`nn.Module`):
The model to evaluate.
inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
The inputs and targets of the model.
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
argument :obj:`labels`. Check your model's documentation for all accepted arguments.
prediction_loss_only (:obj:`bool`):
Whether or not to return the loss only.
Return:
Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
labels (each being optional).
"""
if not self.args.predict_with_generate or prediction_loss_only:
return super().prediction_step(
model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
)
has_labels = "labels" in inputs
inputs = self._prepare_inputs(inputs)
gen_kwargs = {
"max_length": 10, # self._max_length if s is not None else self.model.config.max_length,
"num_beams": 1, #self._num_beams if self._num_beams is not None else self.model.config.num_beams,
"min_length": 1 # for blenderbot, generally we set it to be a large number. But in classification, we set it to 1
}
generated_tokens = self.model.generate(
inputs["input_ids"],
attention_mask=inputs["attention_mask"],
**gen_kwargs,
)
# in case the batch is shorter than max length, the output should be padded
if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
with torch.no_grad():
outputs = model(**inputs)
if has_labels:
if self.label_smoother is not None:
loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
else:
loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
else:
loss = None
if self.args.prediction_loss_only:
return (loss, None, None)
labels = inputs["labels"]
if labels.shape[-1] < gen_kwargs["max_length"]:
labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
# from IPython import embed; embed(header="In seqseqtrainer")
return (loss, generated_tokens, labels)
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds
decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
# post_processor = .get(data_args.dataset_name[0], tokenizer,
# data_args.ignore_pad_token_for_loss)
# decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info)
result = {}
for metric in self.eval_task.metric:
result.update(metric(decoded_preds, decoded_labels))
average_metric = sum(result.values())/len(result)
result.update({"average_metrics":average_metric})
return result
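
The helpers above are meant to be wired together by the example run script. The following is a minimal sketch of that wiring, not part of the diff: `model_args`, `data_args`, `training_args`, `task`, and `raw_dataset` are hypothetical stand-ins for the argument dataclasses and dataset the script supplies, so it illustrates the call pattern rather than a runnable excerpt of the repo.

# Hypothetical wiring; every *_args object is an assumed stand-in.
config, tokenizer, model = get_backbone(model_args)
template, verbalizer, tokenizer_wrapper = get_prompts(task, tokenizer, data_args)

def preprocess(raw_example):
    # Bind the keyword context that preprocess_function expects.
    return preprocess_function(
        raw_example, tokenizer=tokenizer, data_args=data_args,
        template=template, verbalizer=verbalizer,
        tokenizer_wrapper=tokenizer_wrapper, split="train")

train_dataset = raw_dataset.map(
    preprocess, remove_columns=get_remove_columns(list(raw_dataset.features)))
trainer = Trainer(model=model, args=training_args, tokenizer=tokenizer,
                  train_dataset=train_dataset,
                  data_collator=DataCollator(tokenizer, model=model),
                  eval_task=task)
trainer.train()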

View File

@ -0,0 +1,172 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
import numpy as np
from transformers import (
CLIPConfig,
CLIPProcessor,
CLIPModel,
)
from transformers import ViTFeatureExtractor
from PIL import Image
from transformers import Trainer as HfTrainer
import torch.nn as nn
def get_prompts(task, tokenizer, data_args, template_id="clip", verbalizer_id="clip"):
from openpromptu.prompts import GenerationVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer.tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return template, verbalizer, tokenizer_wrapper
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.mask_token
def preprocess_function(raw_example, **kwargs):
# from IPython import embed; embed(header="Therefa")
tokenizer = kwargs['tokenizer']
# ["a photo of {}" for i in range()]
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
example = InputExample(**raw_example)
texts = []
for candidate_label in range(verbalizer.num_classes):
tgt_text = verbalizer.wrap_one_example(label=candidate_label)
wrapped_example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(wrapped_example, tgt_texts=[tgt_text])
texts.append(input_sentence)
# from IPython import embed; embed()
image = Image.open(raw_example['image_file_path'])
model_inputs = tokenizer(images=image, text=texts, max_length=16, padding="max_length", truncation=True, return_tensors='pt')
# from IPython import embed; embed()
model_inputs["pixel_values"] = model_inputs["pixel_values"].squeeze()
model_inputs["label"] = example.label
return model_inputs
def compute_metrics(eval_preds, dataset_name, eval_metric):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in eval_metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result
def get_remove_columns(dataset_features):
# from IPython import embed; embed(header="in remoev")
dataset_features.remove("labels")
print("remove_columns: {}".format(dataset_features))
return dataset_features
class DataCollator(HfDataCollatorMixin):
def __init__(self, *args, **kwargs):
self.return_tensors='pt'
def torch_call(self, features):
# from IPython import embed; embed(header="in data collator")
a = torch_default_data_collator(features=features)
# from IPython import embed; embed(header="in data collator")
a["input_ids"] = a["input_ids"][0]
a["attention_mask"] = a["attention_mask"][0]
return a
def get_backbone(model_args, **kwargs):
config = CLIPConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# config.dropout_rate = 0.0
tokenizer = CLIPProcessor.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = CLIPModel.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# config.num_labels = model_args.num_classes
# old_classifier = model.classifier
# model.classifier = nn.Linear(old_classifier.in_features, config.num_labels)
return config, tokenizer, model
class Trainer(HfTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.verbalizer=verbalizer
self.eval_task=eval_task
self.compute_metrics = self._compute_metrics
self.loss_fn = nn.CrossEntropyLoss()
def compute_loss(self, model, inputs, return_outputs=False):
# from IPython import embed; embed()
labels = inputs.pop('labels')
outputs = model(**inputs)
# logits = outputs.get("logits")
logits_per_image = outputs.logits_per_image
loss = self.loss_fn(logits_per_image, labels)
return (loss, outputs) if return_outputs else loss
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in self.eval_task.metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
# from IPython import embed; embed(header="In compute metrics")
return result
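
The collator and loss above implement classification by image-text matching: each image is scored against one templated sentence per candidate label (which is why the collator keeps only the first copy of the repeated text batch), so `logits_per_image` has shape `[batch, num_classes]` and a plain cross-entropy over label indices applies. A self-contained sketch with made-up shapes:

import torch

# Hypothetical shapes: a batch of 8 images scored against 3 candidate
# label sentences (e.g. the three classes of the beans dataset).
logits_per_image = torch.randn(8, 3)
labels = torch.randint(0, 3, (8,))
loss = torch.nn.CrossEntropyLoss()(logits_per_image, labels)
preds = logits_per_image.argmax(dim=-1)   # predicted class per image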

View File

@ -0,0 +1,171 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
import numpy as np
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoTokenizer,
)
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
import copy
from torch.nn import CrossEntropyLoss
def preprocess_function(raw_example, **kwargs):
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
example = InputExample(**raw_example)
# example = verbalizer.wrap_one_example(example)
example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
padding="max_length", truncation=True)
return model_inputs
def compute_metrics(eval_preds, dataset_name, eval_metric):
pass
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.pad_token
def get_remove_columns(dataset_features):
# dataset_features.remove("label")
return dataset_features
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
from openpromptu.prompts import GenerationVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = None, label_words=None)
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="tail", mask_token_func=mask_token_func)
return template, verbalizer, tokenizer_wrapper
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
if getattr(tokenizer, "pad_token", None) is None:
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
return config, tokenizer, model
class Trainer(HfSeq2SeqTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.eval_task = eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
labels=copy.deepcopy(inputs['input_ids'])
# labels[labels==self.tokenizer.pad_token_id]=-100
outputs = model(**inputs)
logits = outputs.logits
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.long().view(-1))
return (loss, outputs) if return_outputs else loss
def prediction_step(
self,
model, #nn.Module,
inputs, #Dict[str, Union[torch.Tensor, Any]],
prediction_loss_only, #: bool,
ignore_keys, #: Optional[List[str]] = None,
): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
"""
Perform an evaluation step on :obj:`model` using :obj:`inputs`.
Subclass and override to inject custom behavior.
Args:
model (:obj:`nn.Module`):
The model to evaluate.
inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
The inputs and targets of the model.
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
argument :obj:`labels`. Check your model's documentation for all accepted arguments.
prediction_loss_only (:obj:`bool`):
Whether or not to return the loss only.
Return:
Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
labels (each being optional).
"""
if not self.args.predict_with_generate or prediction_loss_only:
return super().prediction_step(
model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
)
inputs = self._prepare_inputs(inputs)
with torch.no_grad():
labels=copy.deepcopy(inputs['input_ids'])
# labels[labels==self.tokenizer.pad_token_id]=-100
outputs = model(**inputs)
logits = outputs.logits
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous().long()
loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1)).detach().cpu()
loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss)
if prediction_loss_only:
return (loss, None, None)
else:
# non pad label
shift_labels = shift_labels.view(-1).detach().cpu()
nonpad_idx = shift_labels!=self.tokenizer.pad_token_id
shift_labels = shift_labels[nonpad_idx]
# the probability at the corresponding position
shift_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu()
target_position = torch.nn.functional.one_hot(shift_labels,shift_logits.shape[-1]).bool().to(shift_labels.device)
shift_logits = shift_logits.softmax(dim=-1)[target_position]
return (loss, shift_logits, shift_labels)
def _compute_metrics(self, eval_preds):
preds, labels = eval_preds
result = {}
for metric in self.eval_task.metric:
result.update(metric(preds, labels,ignore_index=self.tokenizer.pad_token_id))
average_metric = sum(result.values())/len(result)
result.update({"average_metrics":average_metric})
return result
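
Both compute_loss and prediction_step above rely on the standard causal-LM shift: the logit at position t is trained to predict the token at position t+1, so logits drop their last position and labels drop their first. A self-contained illustration with toy tensors:

import torch
from torch.nn import CrossEntropyLoss

logits = torch.randn(2, 5, 100)              # [batch, seq_len, vocab]
input_ids = torch.randint(0, 100, (2, 5))    # labels are the inputs themselves
shift_logits = logits[..., :-1, :].contiguous()   # positions 0..3
shift_labels = input_ids[..., 1:].contiguous()    # tokens    1..4
loss = CrossEntropyLoss()(shift_logits.view(-1, 100), shift_labels.view(-1))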

View File

@ -0,0 +1,177 @@
from openpromptu.data_utils import InputExample
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
from transformers import (
AutoConfig,
AutoModelForSeq2SeqLM,
AutoTokenizer,
)
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
import torch
def mask_token_func(tokenizer, ith_mask):
return tokenizer.additional_special_tokens[ith_mask]
def get_remove_columns(dataset_features):
return dataset_features
def preprocess_function(raw_example, **kwargs):
# max_target_length += 1
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
split = kwargs['split']
example = InputExample(**raw_example)
example = verbalizer.wrap_one_example(example)
example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
model_inputs = tokenizer(input_sentence, max_length=256,
padding="max_length", truncation=True)
with tokenizer.as_target_tokenizer():
label = tokenizer(other['tgt_text']).input_ids
model_inputs["labels"] = label
return model_inputs
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
return config, tokenizer, model
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
from openpromptu.prompts import GenerationVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return template, verbalizer, tokenizer_wrapper
class Trainer(HfSeq2SeqTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.eval_task = eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
outputs = model(**inputs)
if return_outputs:
return (outputs.loss, outputs)
else:
return outputs.loss
def prediction_step(
self,
model, #nn.Module,
inputs, #Dict[str, Union[torch.Tensor, Any]],
prediction_loss_only, #: bool,
ignore_keys, #: Optional[List[str]] = None,
): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
"""
Perform an evaluation step on :obj:`model` using :obj:`inputs`.
Subclass and override to inject custom behavior.
Args:
model (:obj:`nn.Module`):
The model to evaluate.
inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
The inputs and targets of the model.
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
argument :obj:`labels`. Check your model's documentation for all accepted arguments.
prediction_loss_only (:obj:`bool`):
Whether or not to return the loss only.
Return:
Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
labels (each being optional).
"""
if not self.args.predict_with_generate or prediction_loss_only:
return super().prediction_step(
model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
)
has_labels = "labels" in inputs
inputs = self._prepare_inputs(inputs)
gen_kwargs = {
"max_length": 10, # self._max_length if s is not None else self.model.config.max_length,
"num_beams": 1 #self._num_beams if self._num_beams is not None else self.model.config.num_beams,
}
generated_tokens = self.model.generate(
inputs["input_ids"],
attention_mask=inputs["attention_mask"],
**gen_kwargs,
)
# in case the batch is shorter than max length, the output should be padded
if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
with torch.no_grad():
outputs = model(**inputs)
if has_labels:
if self.label_smoother is not None:
loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
else:
loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
else:
loss = None
if self.args.prediction_loss_only:
return (loss, None, None)
labels = inputs["labels"]
if labels.shape[-1] < gen_kwargs["max_length"]:
labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
# from IPython import embed; embed(header="In seqseqtrainer")
return (loss, generated_tokens, labels)
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds
decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
# post_processor = .get(data_args.dataset_name[0], tokenizer,
# data_args.ignore_pad_token_for_loss)
# decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info)
result = {}
for metric in self.eval_task.metric:
result.update(metric(decoded_preds, decoded_labels))
average_metric = sum(result.values())/len(result)
result.update({"average_metrics":average_metric})
return result
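
Unlike the Blenderbot helper, `mask_token_func` here returns a sentinel from `additional_special_tokens`, which for T5-style tokenizers are the `<extra_id_*>` placeholders. A quick check, assuming a T5 checkpoint such as `t5-small` is available locally or from the hub:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("t5-small")
print(mask_token_func(tok, 0))   # e.g. "<extra_id_0>"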

View File

@ -0,0 +1,48 @@
{
"bottleneck_dim": 24,
"dataset_config_name": [
"en"
],
"delta_type": "adapter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "beans",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32",
"num_classes": 3,
"num_train_epochs": 20,
"output_dir": "outputs/adapter/clip-vit-base-patch32/beans",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_delta_center": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "beans",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "beans",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0
}
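
Configs like the one above are plain JSON consumed by the example run scripts; a minimal sketch of inspecting one (the path is illustrative):

import json

with open("configs/adapter/clip-vit-base-patch32/beans.json") as fin:
    cfg = json.load(fin)
print(cfg["delta_type"], cfg["learning_rate"], cfg["unfrozen_modules"])
# adapter 0.0003 ['deltas', 'layer_norm', 'final_layer_norm']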

View File

@ -0,0 +1,53 @@
{
"backbone_model": "opt",
"bottleneck_dim": 24,
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "adapter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "wikitext",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps":2,
"greater_is_better": false,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 900,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m",
"model_path_public": "opt-350m",
"num_train_epochs": 3,
"output_dir": "outputs/adapter/opt-350m/wikitext",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 6,
"per_device_train_batch_size": 6,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "wikitext",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "wikitext",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["self_attn"]
}

View File

@ -0,0 +1,53 @@
{
"backbone_model": "vit",
"bottleneck_dim": 24,
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": false,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "adapter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "beans",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k",
"model_path_public": "vit-large-patch16-224-in21k",
"num_classes": 3,
"num_train_epochs": 20,
"output_dir": "outputs/adapter/vit-large-patch16-224-in21k/beans",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": false,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "beans",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "beans",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["output"]
}

View File

@ -0,0 +1,51 @@
{
"backbone_model": "t5-large",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "rte",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/t5-large",
"model_path_public": "t5-large",
"num_train_epochs": 20,
"output_dir": "outputs/bitfit/t5-large/rte",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "rte",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "rte",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/t5-large",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["attn", "ff", "layer_norm"]
}

View File

@ -0,0 +1,66 @@
{
"backbone_model": "blenderbot",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "compacter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "sst2",
"eval_steps": 200,
"evaluation_strategy": "steps",
"factorized_phm": true,
"factorized_phm_rule": false,
"gradient_clip": false,
"greater_is_better": true,
"hypercomplex_adapters": true,
"hypercomplex_division": 4,
"hypercomplex_nonlinearity": "glorot-uniform",
"learn_phm": true,
"learning_rate": 0.003,
"load_best_model_at_end": true,
"max_source_length": 128,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b",
"model_path_public": "blenderbot-3b",
"non_linearity": "gelu_new",
"normalize_phm_weight": false,
"num_train_epochs": 3,
"output_dir": "outputs/compacter/blenderbot-3b/sst2",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"phm_c_init": "normal",
"phm_clamp": false,
"phm_init_range": 0.0001,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"shared_phm_rule": false,
"split_validation_test": true,
"task_name": "sst2",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "sst2",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"use_bias_down_sampler": true,
"use_bias_up_sampler": true,
"warmup_steps": 0,
"modified_modules":["fc2"]
}

View File

@ -0,0 +1,51 @@
{
"backbone_model": "deberta-v2-xlarge",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "compacter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mnli",
"eval_steps": 500,
"evaluation_strategy": "steps",
"greater_is_better": true,
"is_seq2seq": false,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge",
"num_train_epochs": 3,
"output_dir": "outputs/compacter/deberta-v2-xlarge/mnli",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": false,
"push_to_dc": true,
"push_to_hub": false,
"save_steps": 500,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "mnli",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mnli",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["attention"]
}

View File

@ -0,0 +1,51 @@
{
"backbone_model": "long-t5",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "compacter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "rte",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large",
"model_path_public": "long-t5-tglobal-large",
"num_train_epochs": 20,
"output_dir": "outputs/compacter/long-t5-tglobal-large/rte",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "rte",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "rte",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["attn", "ff", "layer_norm"]
}

View File

@ -0,0 +1,51 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
parser.add_argument("--")
args = parser.parse_args()
if __name__ == "__main__":
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"configs/{args.job}/"):
os.mkdir(f"configs/{args.job}/")
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)

View File

@ -0,0 +1,116 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
#### ROBERTA######
BaseConfigs['albert-xlarge-v2'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}albert-xlarge-v2",
"tokenizer_name": f"{PATHBASE}albert-xlarge-v2",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['prefix_albert-xlarge-v2'] = copy.deepcopy(BaseConfigs['albert-xlarge-v2'])
AllConfigs['prefix_albert-xlarge-v2'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/albert-xlarge-v2/",
})
AllConfigs['soft_prompt_albert-xlarge-v2'] = copy.deepcopy(BaseConfigs['albert-xlarge-v2'])
AllConfigs['soft_prompt_albert-xlarge-v2'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/albert-xlarge-v2/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"configs/{args.job}/"):
os.mkdir(f"configs/{args.job}/")
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
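
The tuple-key convention drives the whole generator: a tuple key zips per-job values together, scalar keys are shared across all jobs, and `job_name` names the output file and is appended to `output_dir`. A tiny worked example with a hypothetical `demo` job:

config = {
    ("job_name", "task_name", "eval_steps"): zip(["rte", "mrpc"],
                                                 ["rte", "mrpc"],
                                                 [100, 200]),
    "seed": 42,
    "output_dir": "outputs/demo/",
}
# Fed through the loop above, this writes:
#   configs/demo/rte.json  -> {"task_name": "rte",  "eval_steps": 100,
#                              "seed": 42, "output_dir": "outputs/demo/rte"}
#   configs/demo/mrpc.json -> {"task_name": "mrpc", "eval_steps": 200,
#                              "seed": 42, "output_dir": "outputs/demo/mrpc"}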

View File

@ -0,0 +1,261 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
# PATHBASE=""
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['bart-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}bart-base",
"tokenizer_name": f"{PATHBASE}bart-base",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['bitfit_bart-base'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/bart-base/",
})
AllConfigs['adapter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['adapter_bart-base'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/bart-base/",
})
AllConfigs['parallel_adapter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['parallel_adapter_bart-base'].update({
"delta_type": "parallel_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/parallel_adapter/bart-base/",
})
AllConfigs['lora_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['lora_bart-base'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"modified_modules": [
"q_proj",
"v_proj",
],
"lora_r": 8,
"output_dir": "outputs/lora/bart-base/",
})
AllConfigs['compacter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['compacter_bart-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/bart-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['compacter++_bart-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/bart-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['low_rank_adapter_bart-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/bart-base/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['prefix_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['prefix_bart-base'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/bart-base/",
})
AllConfigs['soft_prompt_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['soft_prompt_bart-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/bart-base/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"configs/{args.job}/"):
os.mkdir(f"configs/{args.job}/")
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)

View File

@ -0,0 +1,250 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['beit-base-patch16-224'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "num_classes"): zip(
["beans"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20],
[256],
[ 32],
[ 32],#, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0], # *7 +[0] *8,
[200],# 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200],#, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[ 3],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}beit-base-patch16-224",
"tokenizer_name": f"{PATHBASE}beit-base-patch16-224",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps",
"datasets_load_from_disk":False,
}
AllConfigs['bitfit_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['bitfit_beit-base-patch16-224'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/beit-base-patch16-224/",
})
AllConfigs['adapter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['adapter_beit-base-patch16-224'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/beit-base-patch16-224/",
})
AllConfigs['lora_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['lora_beit-base-patch16-224'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layernorm_after",
"classifier"
],
"modified_modules":[
"query",
"value",
],
"lora_r": 8,
"output_dir": "outputs/lora/beit-base-patch16-224/",
})
AllConfigs['compacter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['compacter_beit-base-patch16-224'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/beit-base-patch16-224/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['compacter++_beit-base-patch16-224'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/beit-base-patch16-224/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['low_rank_adapter_beit-base-patch16-224'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/beit-base-patch16-224/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['prefix_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['prefix_beit-base-patch16-224'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/beit-base-patch16-224/",
})
AllConfigs['soft_prompt_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['soft_prompt_beit-base-patch16-224'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/beit-base-patch16-224/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"configs/{args.job}/"):
os.mkdir(f"configs/{args.job}/")
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)

View File

@ -0,0 +1,125 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
# PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
#### BERT ######
BaseConfigs['bert-base-cased'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}bert-base-cased",
"tokenizer_name": f"{PATHBASE}bert-base-cased",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps",
"datasets_load_from_disk": True,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/"
}
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['prefix_bert-base-cased'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/bert-base-cased/",
})
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['soft_prompt_bert-base-cased'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/bert-base-cased/",
})
AllConfigs['prefix_bert-large-cased'] = copy.deepcopy(AllConfigs['prefix_bert-base-cased'])
AllConfigs['prefix_bert-large-cased'].update({
"output_dir": "outputs/prefix/bert-large-cased/",
"model_name_or_path": f"{PATHBASE}bert-large-cased",
"tokenizer_name": f"{PATHBASE}bert-large-cased",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"configs/{args.job}/"):
os.mkdir(f"configs/{args.job}/")
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)

View File

@ -0,0 +1,147 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
#### ROBERTA ######
BaseConfigs['bigbird-roberta-large'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}bigbird-roberta-large",
"tokenizer_name": f"{PATHBASE}bigbird-roberta-large",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['bitfit_bigbird-roberta-large'].update({
"delta_type": "bitfit",
"learning_rate": 1e-3,
"output_dir": "outputs/bitfit/bigbird-roberta-large/",
})
AllConfigs['none_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['none_bigbird-roberta-large'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/bigbird-roberta-large/",
})
AllConfigs['lora_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['lora_bigbird-roberta-large'].update({
"delta_type": "lora",
"learning_rate": 1e-3,
"modified_modules": [
"query",
"key",
],
"output_dir": "outputs/lora/bigbird-roberta-large/",
})
AllConfigs['adapter_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['adapter_bigbird-roberta-large'].update({
"delta_type": "adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/adapter/bigbird-roberta-large/",
})
AllConfigs['low_rank_adapter_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['low_rank_adapter_bigbird-roberta-large'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/low_rank_adapter/bigbird-roberta-large/",
})
AllConfigs['soft_prompt_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['soft_prompt_bigbird-roberta-large'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/bigbird-roberta-large/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser(description="Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)  # also creates configs/ if missing
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
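
In the training scripts these fields map onto OpenDelta calls: "delta_type" picks the delta class, "modified_modules" selects which submodules get wrapped (matched against module-name suffixes), and "unfrozen_modules" feeds the freeze step. A sketch against the OpenDelta 0.3.2 API as documented in its README (argument names hedged, not verified against this exact tree):

from transformers import AutoModelForSequenceClassification
from opendelta import LoraModel

# Backbone named in the config; a public checkpoint stands in for PATHBASE here.
model = AutoModelForSequenceClassification.from_pretrained("roberta-base")

# "delta_type": "lora", "modified_modules": ["query", "key"], "lora_r" left at 8
delta = LoraModel(backbone_model=model,
                  modified_modules=["query", "key"],
                  lora_r=8)

# Freeze everything except the new delta parameters ("unfrozen_modules").
delta.freeze_module(exclude=["deltas"], set_state_dict=True)
delta.log()   # prints the modified structure and the trainable-parameter ratio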

View File

@ -0,0 +1,254 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['blenderbot-400M-distill'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}blenderbot-400M-distill",
"tokenizer_name": f"{PATHBASE}blenderbot-400M-distill",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['bitfit_blenderbot-400M-distill'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/blenderbot-400M-distill/",
})
AllConfigs['adapter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['adapter_blenderbot-400M-distill'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/blenderbot-400M-distill/",
})
AllConfigs['lora_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['lora_blenderbot-400M-distill'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"modified_modules":[
"q_proj",
"v_proj",
],
"lora_r": 8,
"output_dir": "outputs/lora/blenderbot-400M-distill/",
})
AllConfigs['compacter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['compacter_blenderbot-400M-distill'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/blenderbot-400M-distill/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['compacter++_blenderbot-400M-distill'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/blenderbot-400M-distill/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['low_rank_adapter_blenderbot-400M-distill'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/blenderbot-400M-distill/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['none_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['none_blenderbot-400M-distill'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/blenderbot-400M-distill/",
})
AllConfigs['soft_prompt_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['soft_prompt_blenderbot-400M-distill'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-2,
"soft_token_num":100,
"token_init": False,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/blenderbot-400M-distill/",
})
AllConfigs['prefix_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['prefix_blenderbot-400M-distill'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/blenderbot-400M-distill/",
})
# NOTE: redefines the soft_prompt entry above; this later block
# (learning_rate 3e-4, no soft_token_num) silently wins.
AllConfigs['soft_prompt_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['soft_prompt_blenderbot-400M-distill'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/blenderbot-400M-distill/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser(description="Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)  # also creates configs/ if missing
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
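
The compacter blocks above expose the PHM (parameterized hypercomplex multiplication) knobs: with "hypercomplex_division" n, every adapter weight is a sum of n Kronecker products, and "factorized_phm" additionally makes each right-hand factor low-rank. A rough numerical sketch of that parameterization (illustrative only, not OpenDelta's exact code):

import torch

n, d_in, d_out = 4, 768, 48    # "hypercomplex_division": 4; toy adapter dims

A = torch.randn(n, n, n) * 1e-4            # "rule" factors, cf. "phm_init_range"
B = torch.randn(n, d_in // n, d_out // n)  # per-division weight factors

# W = sum_i kron(A_i, B_i) -> (d_in, d_out), with far fewer free parameters
# than a dense (d_in x d_out) matrix.
W = sum(torch.kron(A[i], B[i]) for i in range(n))
assert W.shape == (d_in, d_out)

# "factorized_phm": each B_i is itself a rank-1 product, saving more parameters.
rank = 1
B_left  = torch.randn(n, d_in // n, rank)
B_right = torch.randn(n, rank, d_out // n)
B_factored = B_left @ B_right              # batched matmul, same shape as B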

View File

@ -0,0 +1,303 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
# PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['clip-vit-base-patch32'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "num_classes"): zip(
["beans"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20],
[256],
[ 32],
[ 32],#, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0], # *7 +[0] *8,
[200],# 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200],#, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[ 3],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}clip-vit-base-patch32",
"tokenizer_name": f"{PATHBASE}clip-vit-base-patch32",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['bitfit_clip-vit-base-patch32'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/clip-vit-base-patch32/",
})
AllConfigs['none_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['none_clip-vit-base-patch32'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/clip-vit-base-patch32/",
})
AllConfigs['adapter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['adapter_clip-vit-base-patch32'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/clip-vit-base-patch32/",
})
AllConfigs['lora_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['lora_clip-vit-base-patch32'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/clip-vit-base-patch32/",
})
AllConfigs['compacter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['compacter_clip-vit-base-patch32'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/clip-vit-base-patch32/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['compacter++_clip-vit-base-patch32'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/clip-vit-base-patch32/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['low_rank_adapter_clip-vit-base-patch32'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/clip-vit-base-patch32/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['soft_prompt_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['soft_prompt_clip-vit-base-patch32'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-2,
"soft_token_num":100,
"token_init": False,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/clip-vit-base-patch32/",
})
AllConfigs['prefix_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['prefix_clip-vit-base-patch32'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/clip-vit-base-patch32/",
})
# NOTE: redefines the soft_prompt entry above; this later block
# (learning_rate 3e-4, no soft_token_num) silently wins.
AllConfigs['soft_prompt_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['soft_prompt_clip-vit-base-patch32'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/clip-vit-base-patch32/",
})
#### clip-vit-base-patch32
BaseConfigs['t5-small'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}t5-small",
"tokenizer_name": f"{PATHBASE}t5-small",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
AllConfigs['prefix_t5-small'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-small/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser(description="Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)  # also creates configs/ if missing
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
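
Downstream, the emitted JSON is read back by the run script through HfArgumentParser, which fills dataclass fields from the file. A sketch of that consumption side (the dataclass here is a placeholder for the repo's real argument classes, which must cover every key in the JSON):

import sys
from dataclasses import dataclass, field
from transformers import HfArgumentParser, TrainingArguments

@dataclass
class DeltaArguments:                      # placeholder; the repo defines its own
    delta_type: str = "none"
    unfrozen_modules: list = field(default_factory=lambda: ["deltas"])

parser = HfArgumentParser((DeltaArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
    delta_args, training_args = parser.parse_json_file(json_file=sys.argv[1])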

View File

@ -0,0 +1,433 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['t5-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}t5-base",
"tokenizer_name": f"{PATHBASE}t5-base",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-base/",
})
AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['adapter_t5-base'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/t5-base/",
})
AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['lora_t5-base'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-base/",
})
AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter++_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['low_rank_adapter_t5-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/t5-base/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-2,
"soft_token_num":100,
"token_init": False,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/t5-base/",
})
AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['prefix_t5-base'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-base/",
})
#### T5-base
BaseConfigs['t5-small'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}t5-small",
"tokenizer_name": f"{PATHBASE}t5-small",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
AllConfigs['prefix_t5-small'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-small/",
})
#### ROBERTA ######
BaseConfigs['roberta-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}roberta-base",
"tokenizer_name": f"{PATHBASE}roberta-base",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['bitfit_roberta-base'].update({
"delta_type": "bitfit",
"learning_rate": 1e-3,
"output_dir": "outputs/bitfit/roberta-base/",
})
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['none_roberta-base'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/roberta-base/",
})
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['lora_roberta-base'].update({
"delta_type": "lora",
"learning_rate": 1e-3,
"output_dir": "outputs/lora/roberta-base/",
})
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['adapter_roberta-base'].update({
"delta_type": "adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/adapter/roberta-base/",
})
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['low_rank_adapter_roberta-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/low_rank_adapter/roberta-base/",
})
#### ROBERTA ######
BaseConfigs['bert-base-cased'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}bert-base-cased",
"tokenizer_name": f"{PATHBASE}bert-base-cased",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['prefix_bert-base-cased'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/bert-base-cased/",
})
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['soft_prompt_bert-base-cased'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/bert-base-cased/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser(description="Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)  # also creates configs/ if missing
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
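
BitFit ("delta_type": "bitfit") trains only the bias vectors, which is why it pairs with a much larger learning rate (1e-3 here vs 1e-5 for full fine-tuning, "none"). The trainable fraction is easy to inspect (a sketch; any cached checkpoint works):

from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-cased")
total  = sum(p.numel() for p in model.parameters())
biases = sum(p.numel() for n, p in model.named_parameters() if n.endswith(".bias"))
print(f"bias params: {biases:,} / {total:,} = {biases / total:.4%}")
# On BERT-base this comes out on the order of 0.1% of the backbone.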

View File

@ -0,0 +1,163 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
# PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
#### ROBERTA ######
BaseConfigs['roberta-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}roberta-base",
"tokenizer_name": f"{PATHBASE}roberta-base",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps",
"datasets_load_from_disk": True,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/"
}
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['bitfit_roberta-base'].update({
"delta_type": "bitfit",
"learning_rate": 1e-3,
"output_dir": "outputs/bitfit/roberta-base/",
})
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['none_roberta-base'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/roberta-base/",
})
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['lora_roberta-base'].update({
"delta_type": "lora",
"learning_rate": 1e-3,
"output_dir": "outputs/lora/roberta-base/",
})
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['adapter_roberta-base'].update({
"delta_type": "adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/adapter/roberta-base/",
})
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['low_rank_adapter_roberta-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/low_rank_adapter/roberta-base/",
})
AllConfigs['soft_prompt_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['soft_prompt_roberta-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/roberta-base/",
})
AllConfigs['prefix_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['prefix_roberta-base'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/roberta-base/",
})
AllConfigs['prefix_roberta-large'] = copy.deepcopy(AllConfigs['prefix_roberta-base'])
AllConfigs['prefix_roberta-large'].update({
"output_dir": "outputs/prefix/prefix_roberta-large",
"model_name_or_path": f"{PATHBASE}prefix_roberta-large",
"tokenizer_name": f"{PATHBASE}prefix_roberta-large",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser(description="Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)  # also creates configs/ if missing
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
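
The two new keys in this file, "datasets_load_from_disk" and "datasets_saved_path", switch the data pipeline from Hub downloads to a pre-serialized local copy, which is useful on clusters without outbound network. The corresponding datasets calls look roughly like this (a sketch; the per-task subdirectory name is an assumption):

from datasets import load_dataset, load_from_disk

# One-time export on a machine with network access:
load_dataset("super_glue", "boolq").save_to_disk(
    "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/super_glue.boolq")

# Offline training then reads the serialized DatasetDict back:
ds = load_from_disk(
    "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/super_glue.boolq")
print(ds["train"][0])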

View File

@ -0,0 +1,300 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
# PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['t5-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}t5-base",
"tokenizer_name": f"{PATHBASE}t5-base",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hf": False,
"push_to_dc": True,
"save_strategy": "steps",
"datasets_load_from_disk": True,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"backbone_model": "t5", # use in delta center,
"model_path_public": "t5-base", # use in delta center,
}
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-base/",
})
AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['adapter_t5-base'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/t5-base/",
})
AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['lora_t5-base'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-base/",
})
AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter++_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['low_rank_adapter_t5-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/t5-base/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-2,
"soft_token_num":100,
"token_init": False,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/t5-base/",
})
AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['prefix_t5-base'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"reparameterize": False,
"output_dir": "outputs/prefix/t5-base/",
})
# NOTE: redefines the soft_prompt entry above; this later block
# (learning_rate 3e-4, no soft_token_num) silently wins.
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/t5-base/",
})
#### T5-base
BaseConfigs['t5-small'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}t5-small",
"tokenizer_name": f"{PATHBASE}t5-small",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
AllConfigs['prefix_t5-small'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-small/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser(description="Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)  # also creates configs/ if missing
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
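
Soft prompt tuning ("soft_token_num": 100) learns a block of virtual-token embeddings prepended to the input while the backbone stays frozen; "token_init" decides whether they start from real vocabulary rows or from random noise. A bare-bones sketch of the idea (not OpenDelta's implementation):

import torch
import torch.nn as nn

class SoftPrompt(nn.Module):
    def __init__(self, embed: nn.Embedding, soft_token_num=100, token_init=True):
        super().__init__()
        if token_init:      # start from the first soft_token_num vocab embeddings
            init = embed.weight[:soft_token_num].detach().clone()
        else:               # "token_init": False -> random initialization
            init = torch.randn(soft_token_num, embed.embedding_dim) * 0.5
        self.soft = nn.Parameter(init)

    def forward(self, input_embeds):                 # (batch, seq, dim)
        prefix = self.soft.unsqueeze(0).expand(input_embeds.size(0), -1, -1)
        return torch.cat([prefix, input_embeds], dim=1)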

View File

@ -0,0 +1,52 @@
{
"backbone_model": "beit",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk",
"delta_type": "lora",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "cifar10",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224",
"model_path_public": "beit-large-patch16-224",
"num_classes": 10,
"num_train_epochs": 20,
"output_dir": "outputs/lora/beit-large-patch16-224/cifar10",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": false,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "cifar10",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "cifar10",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["query","value"]
}
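
No "lora_r" appears in this file, so OpenDelta's default rank (8 in the generators above) applies. For a BEiT-large block the LoRA overhead on "query" and "value" is quick to tally (a sketch; layer count and hidden size assumed from the public beit-large-patch16-224 card):

hidden, r, layers = 1024, 8, 24        # beit-large dims (assumed); lora_r = 8
per_module = r * (hidden + hidden)     # A: (hidden x r) plus B: (r x hidden)
total = per_module * 2 * layers        # "query" and "value" in every layer
print(f"{total:,} extra params")       # 786,432 -- well under 1% of ~300M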

View File

@ -0,0 +1,52 @@
{
"backbone_model": "gpt-j",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "lora",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "wikitext",
"eval_steps": 500,
"evaluation_strategy": "steps",
"gradient_accumulation_steps":4,
"greater_is_better": false,
"learning_rate": 0.00003,
"load_best_model_at_end": true,
"max_source_length": 512,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B",
"model_path_public": "gpt-j-6B",
"num_train_epochs": 2,
"output_dir": "outputs/lora/gpt-j-6B/wikitext",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 2,
"per_device_train_batch_size": 2,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 500,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "wikitext",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "wikitext",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["20.attn.q_proj","21.attn.q_proj","22.attn.q_proj","23.attn.q_proj","24.attn.q_proj","25.attn.q_proj","26.attn.q_proj","27.attn.q_proj"]
}
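
Here "modified_modules" pins LoRA to q_proj in blocks 20-27 only; GPT-J-6B has 28 blocks, so this adapts just the top 8. Since OpenDelta matches the strings against full module names, the list can be generated rather than typed out (sketch):

modified_modules = [f"{i}.attn.q_proj" for i in range(20, 28)]
# Effective batch size = per_device_train_batch_size (2)
#   x gradient_accumulation_steps (4) x number of devices.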

View File

@ -0,0 +1,52 @@
{
"backbone_model": "roberta-large",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "lora",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "superglue-boolq",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"is_seq2seq": false,
"learning_rate": 0.0001,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large",
"model_path_public": "roberta-large",
"num_train_epochs": 20,
"output_dir": "outputs/lora/roberta-large/superglue-boolq",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": false,
"push_to_hub": false,
"push_to_dc": true,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "superglue-boolq",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "superglue-boolq",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["query","value"]
}

View File

@ -0,0 +1,52 @@
{
"backbone_model": "xlm-roberta-large",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "lora",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "superglue-wic",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"is_seq2seq": false,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large",
"model_path_public": "xlm-roberta-large",
"num_train_epochs": 20,
"output_dir": "outputs/lora/xlm-roberta-large/superglue-wic",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": false,
"push_to_dc": true,
"push_to_hub": false,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "superglue-wic",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "superglue-wic",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["query","value"]
}

View File

@ -0,0 +1,52 @@
{
"backbone_model": "gpt2",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "low_rank_adapter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "wikitext",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps":1,
"greater_is_better": false,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 768,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt2",
"model_path_public": "gpt2",
"num_train_epochs": 2,
"output_dir": "outputs/low_rank_adapter/gpt2/wikitext",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "wikitext",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "wikitext",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt2",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["attn","mlp"]
}

View File

@ -0,0 +1,51 @@
{
"backbone_model": "bert-large-cased",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "prefix",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "rte",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"is_seq2seq": false,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased",
"num_train_epochs": 20,
"output_dir": "outputs/prefix/bert-large-cased/rte",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": false,
"push_to_dc": true,
"push_to_hub": false,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "rte",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "rte",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["attention"]
}
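
Prefix tuning ("delta_type": "prefix") prepends trainable key/value vectors at every attention layer matched by "modified_modules": ["attention"]. For bert-large the added state is 2 x prefix_len x hidden per layer; a quick tally (sketch, with an illustrative prefix length since none is set in this file):

layers, hidden, prefix_len = 24, 1024, 64   # bert-large; prefix_len assumed
added = layers * 2 * prefix_len * hidden    # one key and one value prefix per layer
print(f"{added:,} trainable params")        # 3,145,728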

View File

@ -0,0 +1,51 @@
{
"backbone_model": "bart",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "soft_prompt",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "superglue-boolq",
"eval_steps": 500,
"evaluation_strategy": "steps",
"gradient_accumulation_steps":1,
"greater_is_better": true,
"learning_rate": 0.1,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bart-large",
"model_path_public": "bart-large",
"num_train_epochs": 50,
"output_dir": "outputs/soft_prompt/bart-large/superglue-boolq",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 500,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"soft_token_num":100,
"split_validation_test": true,
"task_name": "superglue-boolq",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "superglue-boolq",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bart-large",
"token_init": true,
"unfrozen_modules": [
"deltas"
],
"warmup_steps": 0
}

View File

@ -0,0 +1,3 @@
from .tasks import TASK_MAPPING, AutoTask
# from .data_collator import TaskDataCollatorForSeq2Seq
# from .postprocessors import AutoPostProcessor

Some files were not shown because too many files have changed in this diff.