Compare commits

...

7 Commits
FM_9G ... FM_9G

Author SHA1 Message Date
wql 940a8b08cb add: add opendelta package 2024-08-08 13:44:48 +08:00
wql 9175f7c9bb fix: change to GPUS_PER_NODE=1 2024-08-08 09:18:29 +08:00
wql 7655337c72 fix: change to GPUS_PER_NODE=2 2024-08-08 09:03:43 +08:00
wql 12f7320b51 fix: change GPUS_PER_NODE to 8 2024-08-07 16:49:33 +08:00
wql db532ca4b1 fix: modify paras in pretrain_dragonfly 2024-08-07 16:00:45 +08:00
wql a62a188fd5 fix: change --language to zh 2024-08-07 14:21:33 +08:00
wql 5de2ff4556 test 2024-08-07 14:03:19 +08:00
234 changed files with 22122 additions and 2 deletions

@@ -222,7 +222,7 @@ fi
GPUS_PER_NODE=1
NNODES=1
RANK=0
-MASTER_ENDPOINT=g3006
+MASTER_ENDPOINT=ubuntu
MASTER_PORT=23456
#CMD="torchrun --nnodes=${NNODES} --nproc_per_node=${GPUS_PER_NODE} --node_rank=${RANK} --master_addr=${MASTER_ENDPOINT} --master_port=${MASTER_PORT} ${PRETRAIN_ENTRY} ${OPTS}"
CMD="torchrun --nnodes=${NNODES} --nproc_per_node=${GPUS_PER_NODE} --node_rank=${RANK} --rdzv_id=1 --rdzv_backend=c10d --rdzv_endpoint=${MASTER_ENDPOINT}:${MASTER_PORT} ${PRETRAIN_ENTRY} ${OPTS}"

OpenDelta-0.3.2/.gitignore vendored Normal file

@@ -0,0 +1,71 @@
data/
**/__pycache__/
logs/*
experiments/logs
!logs/.gitkeep
datasets/*
!datasets/*.sh
.vscode/
*.egg-info/
eggs/
.eggs/
*.egg
**.egg
build/
_build/
**/build/
outputs/
log.txt
**/DeltaHub/
**/sfs_scripts/
*beans/
**/examples/*/configs/*
!examples/*/configs/config_gen.py
**/jupyter_notebook_examples/
!examples/jupyter_notebook_examples/*.py
!examples/*/configs/*.py
**/outputs_search/**/*.bin
**/outputs_search/**/*.pt
*.db
**/nohup.out
**/examples/examples_bmtrain/BigModels/down_data
**/examples/examples_bmtrain/BMTrain_stable
**/examples/examples_bmtrain/BMPretrain
**/examples/examples_bmtrain/BigModels/BigModels/results
**/Delta_Memory/
**/output/
**/thunlp/
**/saved_ckpts/
DeltaCenter-Python-Client/
backbone_structure
delta_checkpoints
gitop.sh
load_dataset_and_model.ipynb
load_model.py
scripts
t.py
t.sh
!examples/examples_prompt/configs/*/*.json
!examples/examples_prompt/configs/**
**/delta_checkpoints/
**/outputs/
dist/
dist/*
**/unittest/**
!unittest/**.py
!unittest/**.sh
!unittest/**.md
**/tutorial/**
!tutorial/**.py
!tutorial/**.sh
!tutorial/**.md

@@ -0,0 +1,29 @@
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Set the version of Python and other tools you might need
build:
  os: ubuntu-20.04
  tools:
    python: "3.9"
    # You can also specify other tool versions:
    # nodejs: "16"
    # rust: "1.55"
    # golang: "1.17"

# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: docs/conf.py

# If using Sphinx, optionally build your docs in additional formats such as PDF
# formats:
#   - pdf

# Optionally declare the Python requirements required to build your docs
python:
  install:
    - requirements: docs/requirements.txt

OpenDelta-0.3.2/LICENSE Normal file

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

OpenDelta-0.3.2/README.md Normal file

@@ -0,0 +1,161 @@
<div align="center">
<img src="https://s4.ax1x.com/2022/02/14/Hy7lAf.png" width="350px">
**An Open-Source Framework for Parameter-Efficient Tuning (Delta Tuning).**
------
<p align="center">
<a href="#Overview">Overview</a>
<a href="#installation">Installation</a>
<a href="https://opendelta.readthedocs.io/en/latest/notes/usage.html">Basic Usage</a>
<a href="https://opendelta.readthedocs.io/">Docs</a>
<a href="https://docs.google.com/spreadsheets/d/1BIVa8ocAPga-u7rBOXLYaTfaJSjI1dWfwohmLjmFDrY/edit?usp=sharing">Performance</a>
</p>
</div>
![version](https://img.shields.io/badge/version-0.3.2-blue)
## Overview
OpenDelta is a toolkit for parameter-efficient tuning methods (which we dub *delta tuning*), with which users can flexibly assign (or add) a small number of parameters to update while keeping most parameters frozen. With OpenDelta, users can easily implement prefix-tuning, adapters, LoRA, or any other type of delta tuning with their preferred PTMs.
- The latest version of OpenDelta is tested on Python==3.8.13, PyTorch==1.12.1, transformers==4.22.2. Other versions are likely to be supported as well. If you encounter bugs when using your own package versions, please raise an issue; we will look into it as soon as possible.
- **A demo of using OpenDelta to modify the PLM (e.g., BART).**
![How PLM changes using Delta-tuning](docs/source/imgs/demo.gif)
## News
- **2022.10.25** Release v0.3.2. Support [BMTrain]()! Improve docs. Add inspect utilities.
- **2022.10.14** Release v0.3.0. We make the usage of the default configurations of each delta tuning method (i.e., the positions they are attached to) more friendly! If a custom model has one of our supported models as a submodule, the default configuration is also available. Other key changes can be seen in the [Update Log](https://opendelta.readthedocs.io/en/latest/notes/update.html#version-0-3-0)
- **2022.10.10** Merge a long-developed branch v0.2.4 into the master branch. Key updates are (1) an example unifying the delta tuning paradigm and the prompt-tuning paradigm; (2) support for [Delta Center](https://www.openbmb.org/toolKits/deltacenter), whose webpage is still under construction. Details can be seen in the [Update Log](https://opendelta.readthedocs.io/en/latest/notes/update.html#version-0-2-4)
- **2022.03.24** We noticed several bugs in Soft Prompt Tuning and Prefix Tuning, mainly due to their need to customize attention ids and token_type_ids; we are fixing them! For now, please use the other methods, since they are more stable and perform better.
- **2022.03.20** Add a [colab example](https://colab.research.google.com/drive/1uAhgAdc8Qr42UKYDlgUv0f7W1-gAFwGo?usp=sharing) to illustrate efficient training and space-saving multitask-serving.
- **2022.03.20** A new pip version released.
- **2022.02.16** Support [regular expression](https://opendelta.readthedocs.io/en/latest/notes/namebasedaddr.html#regexexpr) in named-based addressing.
## Installation
1. create a virtualenv (optional)
```shell
conda create -n opendelta_env python=3.8
conda activate opendelta_env
```
2. install the latest version
```bash
pip install git+https://github.com/thunlp/OpenDelta.git
```
**or** install the latest pip version (more stable)
```bash
pip install opendelta
```
**or** build from source
```bash
git clone git@github.com:thunlp/OpenDelta.git
cd OpenDelta
python setup.py install
# python setup.py develop  # if you want to modify the code for your research
```
## Must Try
The following code and comments walk you through the key functionality of OpenDelta. It is also available in [must_try.py](https://github.com/thunlp/OpenDelta/tree/main/examples/unittest/must_try.py) and [must_try.ipynb on Colab](https://colab.research.google.com/drive/1Nbe9zxt8LGQnKmtvEs07IN_PznjNCyk4?usp=sharing).
```python
# use transformers as usual.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
t5_tokenizer = AutoTokenizer.from_pretrained("t5-large")
# A running example (the input is deliberately misspelled; the delta model loaded below does spelling correction)
inputs_ids = t5_tokenizer.encode("Is Harry Poter wrtten by JKrowling", return_tensors="pt")
t5_tokenizer.decode(t5.generate(inputs_ids)[0])
# >>> '<pad><extra_id_0>? Is it Harry Potter?</s>'
# use existing delta models
from opendelta import AutoDeltaModel, AutoDeltaConfig
# use existing delta models from DeltaCenter
delta = AutoDeltaModel.from_finetuned("thunlp/Spelling_Correction_T5_LRAdapter_demo", backbone_model=t5)
# freeze the whole backbone model except the delta models.
delta.freeze_module()
# visualize the change
delta.log()
t5_tokenizer.decode(t5.generate(inputs_ids)[0])
# >>> <pad> Is Harry Potter written by JK Rowling?</s>
# Now save only the delta model, not the whole backbone model, to .tmp/
delta.save_finetuned(".tmp")
import os; os.listdir(".tmp")
# >>> The state dict size is 1.443 MB
# >>> We encourage users to push their final and public models to delta center to share them with the community!
# reload the model from the local path and add it to the pre-trained T5.
t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
delta1 = AutoDeltaModel.from_finetuned(".tmp", backbone_model=t5)
import shutil; shutil.rmtree(".tmp") # don't forget to remove the tmp files.
t5_tokenizer.decode(t5.generate(inputs_ids)[0])
# >>> <pad> Is Harry Potter written by JK Rowling?</s>
# detach the delta model; the backbone returns to its unmodified state.
delta1.detach()
t5_tokenizer.decode(t5.generate(inputs_ids)[0])
# >>> '<pad><extra_id_0>? Is it Harry Potter?</s>'
# use the default configuration for customized wrapped models which have PLMs inside. This is a common need for users.
import torch.nn as nn
class WrappedModel(nn.Module):
def __init__(self, inner_model):
super().__init__()
self.inner = inner_model
def forward(self, *args, **kwargs):
return self.inner(*args, **kwargs)
wrapped_model = WrappedModel(WrappedModel(t5))
# say we use LoRA
delta_config = AutoDeltaConfig.from_dict({"delta_type":"lora"})
delta2 = AutoDeltaModel.from_config(delta_config, backbone_model=wrapped_model)
delta2.log()
# >>> root
# -- inner
# -- inner
# ...
# ... lora_A:[8,1024], lora_B:[1024,8]
delta2.detach()
# use a non-default configuration
# say we add LoRA to the feed-forward layers of the last four decoder blocks of t5, with lora rank=5
delta_config3 = AutoDeltaConfig.from_dict({"delta_type":"lora", "modified_modules":["[r]decoder.*((20)|(21)|(22)|(23)).*DenseReluDense\.wi"], "lora_r":5})
delta3 = AutoDeltaModel.from_config(delta_config3, backbone_model=wrapped_model)
delta3.log()
```
## Verified Default Configurations
- **You can try to use OpenDelta on *any* backbone models based on PyTorch.**
- However, there is a small chance that the interface of the submodules of the backbone model is not supported. Therefore we have verified some commonly used models that OpenDelta is sure to support.
- We will keep testing more and more emerging models.
- Pull requests are welcome when you successfully apply OpenDelta to your own backbone model.
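A minimal sketch (not from the original README) of relying on a verified default configuration, assuming a backbone such as BERT whose default positions OpenDelta knows; the `"adapter"` type string follows the `"lora"` pattern shown above and is an assumption:

```python
from transformers import AutoModelForSequenceClassification
from opendelta import AutoDeltaConfig, AutoDeltaModel

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
# No "modified_modules" given: the default configuration for this backbone is used.
delta_config = AutoDeltaConfig.from_dict({"delta_type": "adapter"})
delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model)
delta_model.freeze_module(exclude=["deltas"])  # freeze everything except the adapters
delta_model.log()                              # inspect where the deltas were attached
```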

@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.https://www.sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd

@@ -0,0 +1,20 @@
# OpenDelta Documentation
To build this documentation locally, please first install the [sphinx](https://www.sphinx-doc.org/en/master/) packages.
```
pip install sphinx
pip install sphinx_rtd_theme
pip install sphinx_copybutton
pip install sphinx_toolbox
pip install myst_parser
```
Then install opendelta either from source, or from pip. After that,
```
cd docs
make html
```
Then open the generated `docs/build/html/index.html` in your local browser.

@@ -0,0 +1,17 @@
sphinx_copybutton
sphinx_rtd_theme
sphinx_toolbox
myst_parser
torch>=1.8.0
transformers>=4.10.0
datasets==1.17.0
sentencepiece>=0.1.96
tqdm>=4.62.2
decorator
rich
web.py
gitpython
scipy # need?
sklearn # need?
delta_center_client==0.0.4

@@ -0,0 +1,268 @@
/* a, */
.wy-menu-vertical header,
.wy-menu-vertical p.caption,
.wy-nav-top .fa-bars,
.wy-menu-vertical a:hover,
/* Colors and text decoration.
For example, :black:`text in black` or :blink:`text blinking` in rST. */
/* .black {
color: black;
}
.gray {
color: gray;
}
.grey {
color: gray;
}
.silver {
color: silver;
}
.white {
color: white;
}
.maroon {
color: maroon;
}
.red {
color: red;
}
.magenta {
color: magenta;
}
.fuchsia {
color: fuchsia;
}
.pink {
color: pink;
}
.orange {
color: rgba(218, 135, 12, 0.897);
} */
/* .string {
color: rgb(172, 51, 44);
} */
/* .yellow {
color: yellow;
}
.lime {
color: lime;
}
.green {
color: green;
}
.olive {
color: olive;
}
.teal {
color: teal;
}
.cyan {
color: cyan;
}
.aqua {
color: aqua;
}
.blue {
color: blue;
}
.navy {
color: navy;
}
.purple {
color: purple;
}
.under {
text-decoration: underline;
}
.over {
text-decoration: overline;
}
.blink {
text-decoration: blink;
}
.line {
text-decoration: line-through;
}
.strike {
text-decoration: line-through;
}
.it {
font-style: italic;
}
.ob {
font-style: oblique;
}
.small {
font-size: small;
}
.large {
font-size: large;
}
.smallpar {
font-size: small;
} */
a:link {
color: rgb(141, 99, 224)
}
a:visited {
color: rgb(141, 99, 224)
}
a:hover {
color: rgb(147, 47, 218)
}
.rst-content code.literal
{
color: rgb(172, 49, 42) !important;
/* #5360f0 */
}
.rst-content tt.literal
{
color: #f06b53 !important;
}
/* #a153f0 */
/* inspired by sphinx press theme */
.wy-menu.wy-menu-vertical li.toctree-l1.current > a {
border-left: solid 15px rgb(150, 92, 232) !important;
text-indent: -15px;
border-top: none;
border-bottom: none;
}
.wy-menu.wy-menu-vertical li.toctree-l1.current > ul {
border-left: solid 15px #ddcaf7 !important;
}
/* inspired by sphinx press theme */
.wy-nav-side {
color: unset !important;
background: unset !important;
border-right: solid 1px #ccc !important;
}
.wy-side-nav-search,
.wy-nav-top,
.wy-menu-vertical li,
.wy-menu-vertical li a:hover,
.wy-menu-vertical li a
{
background: unset !important;
}
.wy-menu-vertical li.current a {
border-right: unset !important;
}
.wy-side-nav-search div,
.wy-menu-vertical a {
color: #404040 !important;
}
.wy-menu-vertical button.toctree-expand {
color: #333 !important;
}
.wy-nav-content {
max-width: unset;
}
.rst-content {
max-width: 900px;
}
.wy-nav-content .icon-home:before {
content: "Docs";
}
.wy-side-nav-search .icon-home:before {
content: "";
}
dl.field-list {
display: block !important;
}
dl.field-list > dt:after {
content: "" !important;
}
dl.field-list > dt {
display: table;
padding-left: 6px !important;
padding-right: 6px !important;
margin-bottom: 4px !important;
padding-bottom: 1px !important;
background: rgb(252, 237, 208);
border-left: solid 2px rgb(231, 181, 134);
}
dl.py.class>dt
{
color: rgba(17, 16, 17, 0.822) !important;
background: rgb(247, 234, 252) !important;
border-top: solid 2px #b620d0 !important;
}
dl.py.method>dt
{
background: rgb(250, 239, 241) !important;
border-left: solid 2px rgb(199, 83, 106) !important;
}
dl.py.attribute>dt,
dl.py.property>dt
{
background: rgba(194, 233, 248, 0.1) !important;
border-left: solid 2px #58b5cc !important;
}
.fa-plus-square-o::before, .wy-menu-vertical li button.toctree-expand::before,
.fa-minus-square-o::before, .wy-menu-vertical li.current > a button.toctree-expand::before, .wy-menu-vertical li.on a button.toctree-expand::before
{
content: "";
}
.rst-content .viewcode-back,
.rst-content .viewcode-link
{
font-size: 120%;
}

@@ -0,0 +1,7 @@
document.addEventListener("DOMContentLoaded", function(event) {
    document.querySelectorAll(".wy-menu.wy-menu-vertical > ul.current > li > a").forEach(a => a.addEventListener("click", e => {
        // Toggle visibility of the current top-level section in the sidebar.
        const f = document.querySelector(".wy-menu.wy-menu-vertical > ul.current > li > ul");
        if (f.style.display == 'none') { f.style.display = 'block'; } else { f.style.display = 'none'; }
    }));
    // Replace the default header anchor text with a link emoji.
    document.querySelectorAll(".headerlink").forEach(a => a.text = "\u{1F517}");
});

@@ -0,0 +1,147 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
import sys
sys.path.insert(0, "../../")
import datetime
import sphinx_rtd_theme
import doctest
import opendelta

# -- Project information -----------------------------------------------------

project = 'OpenDelta'
author = 'THUNLP OpenDelta Team'
copyright = '{}, {}, Licensed under the Apache License, Version 2.0'.format(datetime.datetime.now().year, author)

# The full version, including alpha/beta/rc tags
release = '0.3.2'
version = "0.3.2"

html_theme = 'sphinx_rtd_theme'
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

doctest_default_flags = doctest.NORMALIZE_WHITESPACE
autodoc_member_order = 'bysource'
intersphinx_mapping = {
    'python': ('https://docs.python.org/', None),
    'torch': ('https://pytorch.org/docs/stable/', None),
}
html_show_sourcelink = True

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
    'sphinx.ext.doctest',
    'sphinx.ext.intersphinx',
    # 'sphinx.ext.mathbase',
    'sphinx.ext.mathjax',
    'sphinx.ext.napoleon',
    'sphinx.ext.viewcode',
    'sphinx.ext.githubpages',
    'sphinx_copybutton',
    'sphinx_toolbox.collapse',
    'myst_parser',
]

myst_enable_extensions = [
    "html_image",
    "colon_fence",
    "html_admonition",
    "amsmath",
    "dollarmath",
]

source_suffix = {
    '.rst': 'restructuredtext',
    '.txt': 'markdown',
    '.md': 'markdown',
}

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
# exclude_patterns = []

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
# html_theme = 'alabaster'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_theme_options = {
    # 'collapse_navigation': False,
    # 'display_version': True,
    # 'logo_only': False,
    'navigation_depth': 2,
}
html_static_path = ['_static']
html_css_files = ['css/custom.css']
html_js_files = ['js/custom.js']

rst_context = {'opendelta': opendelta}
# rst_epilog = "\n.. include:: .special.rst\n"
add_module_names = False

def include_only_tagged(app, what, name, obj, skip, options):
    inclusion_tag_format = "[NODOC]"  # can be any pattern here, choose what works for you
    for tag in app.tags.tags:
        if obj.__doc__ is not None and not obj.__doc__.startswith(inclusion_tag_format):
            return False
    return True

def skip2(app, what, name, obj, skip, options):
    members = [
        '__init__',
        '__repr__',
        '__weakref__',
        '__dict__',
        '__module__',
    ]
    return True if name in members else skip

def skip(app, what, name, obj, skip, options):
    skip = include_only_tagged(app, what, name, obj, skip, options) or \
        skip2(app, what, name, obj, skip, options)
    return skip

def setup(app):
    def rst_jinja_render(app, docname, source):
        src = source[0]
        rendered = app.builder.templates.render_string(src, rst_context)
        source[0] = rendered

    app.connect('autodoc-skip-member', skip)
    app.connect("source-read", rst_jinja_render)

26 binary image files added (documentation images; content not shown)

@@ -0,0 +1,75 @@
OpenDelta's documentation!
=====================================
[OpenDelta](https://github.com/thunlp/OpenDelta/) is a **plug-and-play** library for parameter-efficient fine-tuning ([delta-tuning](WhatisDelta)) of pre-trained models.
## Essential Advantages:
- <span style="color:rgb(81, 217, 245);font-weight:bold">Clean:</span> No need to edit the backbone PTMs codes.
- <span style="color:orange;font-weight:bold">Simple:</span> Migrating from full-model tuning to delta-tuning takes as little as 3 lines of code (see the sketch below this list).
- <span style="color:green;font-weight:bold">Sustainable:</span> Most evolution in external libraries doesn't require a new version of OpenDelta.
- <span style="color:red;font-weight:bold">Extendable:</span> Various PTMs can share the same delta-tuning codes.
- <span style="color:purple;font-weight:bold">Flexible:</span> Able to apply delta-tuning to (almost) any position of the PTMs.
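A minimal sketch of those 3 lines (not part of the original page; it assumes a backbone, here BERT, whose default adapter positions are known to OpenDelta):

```python
from transformers import AutoModelForSequenceClassification
from opendelta import AdapterModel

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

delta_model = AdapterModel(backbone_model=model)  # 1. attach adapters at default positions
delta_model.freeze_module(exclude=["deltas"])     # 2. freeze the backbone, keep deltas trainable
delta_model.log()                                 # 3. inspect what changed
```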
```{eval-rst}
.. toctree::
   :maxdepth: 1
   :caption: Getting Started

   notes/overview.md
   notes/installation.md
   notes/quickstart.md
   notes/custom.md

.. toctree::
   :maxdepth: 1
   :caption: Advanced Usage

   notes/autodelta.md
   notes/deltacenter.md
   notes/composition.md
   notes/pluginunplug.md
   notes/withbmtrain.md
   notes/withaccelerate.md
   notes/examples.md

.. toctree::
   :maxdepth: 1
   :caption: Utilities

   notes/inspect.md

.. toctree::
   :maxdepth: 1
   :caption: Mechanisms

   notes/keyfeature.md
   notes/namebasedaddr.md
   notes/unifyname.md

.. toctree::
   :maxdepth: 1
   :caption: Information

   notes/citation.md
   notes/update.md
   notes/faq.md

.. toctree::
   :maxdepth: 2
   :caption: Documentation

   modules/base
   modules/deltas
   modules/auto_delta
   modules/utils

Indices and tables
==================

* :ref:`genindex`
```

@@ -0,0 +1,14 @@
Auto Classes
======================================

AutoDeltaConfig
------------------------------------

.. autoclass:: opendelta.auto_delta.AutoDeltaConfig
   :members:

AutoDeltaModel
------------------------------------

.. autoclass:: opendelta.auto_delta.AutoDeltaModel
   :members:

@@ -0,0 +1,14 @@
Base Classes
======================================

BaseDeltaConfig
------------------------------------

.. autoclass:: opendelta.delta_configs.BaseDeltaConfig
   :members:

DeltaBase
------------------------------------

.. autoclass:: opendelta.basemodel.DeltaBase
   :members:

@@ -0,0 +1,46 @@
Delta Models
======================================

Lora
---------------------------------------

.. autoclass:: opendelta.LoraModel
   :members:

BitFit
---------------------------------------

.. autoclass:: opendelta.BitFitModel
   :members:

Adapter
---------------------------------------

.. autoclass:: opendelta.AdapterModel
   :members:

LowRankAdapter
---------------------------------------

.. autoclass:: opendelta.LowRankAdapterModel
   :members:

Compacter
---------------------------------------

.. autoclass:: opendelta.CompacterModel
   :members:

Prefix tuning
------------------------------------

.. autoclass:: opendelta.PrefixModel
   :members:

Soft Prompt Tuning
------------------------------------

.. autoclass:: opendelta.SoftPromptModel
   :members:

@@ -0,0 +1,45 @@
# Utils
## SaveLoadMixin
```{eval-rst}
.. autoclass:: opendelta.utils.saving_loading_utils.SaveLoadMixin
   :members:
```
## Visualization
```{eval-rst}
.. autoclass:: opendelta.utils.visualization.Visualization
   :members:
```
## Structure Map
```{eval-rst}
.. autoclass:: opendelta.utils.structure_mapping.CommonStructureMap
   :members:
```
## Utility Functions
### Hashing
```{eval-rst}
.. automodule:: opendelta.utils.model_md5
   :members:
```
### Signature
```{eval-rst}
.. automodule:: opendelta.utils.signature
   :members:
```
### Named-based addressing
```{eval-rst}
.. automodule:: opendelta.utils.name_based_addressing
   :members:
```

@@ -0,0 +1,90 @@
(autodelta)=
# AutoDelta Mechanism
Inspired by [Huggingface transformers AutoClasses](https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/auto#transformers.AutoModel), we provide the AutoDelta feature, which lets users
1. easily experiment with different delta models;
2. quickly deploy from a configuration file, especially from the repos in [DeltaCenter](https://examplelink).
## Easily load from a dict, making it easy to change the type of delta model.
```python
from opendelta import AutoDeltaConfig, AutoDeltaModel
from transformers import T5ForConditionalGeneration
backbone_model = T5ForConditionalGeneration.from_pretrained("t5-base")
```
We can load a config from a dict
```python
config_dict = {
"delta_type":"lora",
"modified_modules":[
"SelfAttention.q",
"SelfAttention.v",
"SelfAttention.o"
],
"lora_r":4}
delta_config = AutoDeltaConfig.from_dict(config_dict)
```
Then use the config to add a delta model to the backbone model
```python
delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=backbone_model)
# now visualize the modified backbone_model
from bigmodelvis import Visualization
Visualization(backbone_model).structure_graph()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/t5lora.png
---
width: 600px
name: t5lora
---
```
````
## Fast deployment of a fine-tuned delta checkpoint from DeltaCenter
```python
# use transformers as usual.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
t5_tokenizer = AutoTokenizer.from_pretrained("t5-large")
# A running example
inputs_ids = t5_tokenizer.encode("Is Harry Poter wrtten by JKrowling", return_tensors="pt")
t5_tokenizer.decode(t5.generate(inputs_ids)[0])
# >>> '<pad><extra_id_0>? Is it Harry Potter?</s>'
```
Load delta model from delta center:
```python
# use existing delta models
from opendelta import AutoDeltaModel, AutoDeltaConfig
# use existing delta models from DeltaCenter
delta = AutoDeltaModel.from_finetuned("thunlp/Spelling_Correction_T5_LRAdapter_demo", backbone_model=t5)
# freeze the whole backbone model except the delta models.
delta.freeze_module()
# visualize the change
delta.log()
t5_tokenizer.decode(t5.generate(inputs_ids)[0])
# >>> <pad> Is Harry Potter written by JK Rowling?</s>
```
<div class="admonition note">
<p class="title">**Hash check**</p>
<p>
Since the delta model only works together with the backbone model,
we will automatically check whether you load the delta model the same way it was trained.
</p>
<p>
We calculate the trained model's [md5](http://some_link) and save it to the config. When the delta model finishes loading, we re-calculate the md5 to see whether it has changed.
</p>
<p> Note that passing the hash check guarantees performance, but there are cases where the hash check fails yet performance is still normal, for various reasons. We are investigating these cases. Please consider this feature a supplement. </p>
<p>Pass `check_hash=False` to disable hash checking.</p>
</div>
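For reference, a minimal sketch of disabling the check with the `check_hash` argument mentioned above (it reuses the `t5` backbone loaded earlier on this page):

```python
from opendelta import AutoDeltaModel

delta = AutoDeltaModel.from_finetuned(
    "thunlp/Spelling_Correction_T5_LRAdapter_demo",
    backbone_model=t5,   # the backbone loaded earlier on this page
    check_hash=False,    # skip the md5 consistency check
)
```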

@@ -0,0 +1,12 @@
# Citation
If you find our repo useful, please cite the following paper.
```
@article{ding2022delta,
title={Delta tuning: A comprehensive study of parameter efficient methods for pre-trained language models},
author={Ding, Ning and Qin, Yujia and Yang, Guang and Wei, Fuchao and Yang, Zonghan and Su, Yusheng and Hu, Shengding and Chen, Yulin and Chan, Chi-Min and Chen, Weize and others},
journal={arXiv preprint arXiv:2203.06904},
year={2022}
}
```

@@ -0,0 +1,51 @@
# Composition of delta models
With OpenDelta, you can compose different delta models.
## Add different deltas to the backbone
```
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("roberta-base")
from opendelta import LoraModel, AdapterModel
delta_model = LoraModel(backbone_model=model, modified_modules=['key'], lora_r=1)
delta_model2 = AdapterModel(backbone_model=model, modified_modules=['output'], bottleneck_dim=12)
delta_model.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/composition_of_delta.png
---
width: 600px
name: composition_of_delta
---
```
````
## Even add multiple delta to the same layer
```
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base")
from opendelta import AdapterModel, LowRankAdapterModel
delta_model = AdapterModel(backbone_model=model, modified_modules=['fc2'])
delta_model2 = AdapterModel(backbone_model=model, modified_modules=['fc2'], bottleneck_dim=12)
delta_model3 = LowRankAdapterModel(backbone_model=model, modified_modules=['fc2'], reduction_factor=12)
delta_model.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/multiple_to_one_layer.png
---
width: 600px
name: multiple_to_one_layer
---
```
````
:::{admonition} Order of Insertion
:class: warning
**When adding deltas to the same layer, pay attention to the order of insertion.** In the above example, the adapters are added after `fc2`: the tensor first goes through `adapter`, then `adapter_1`, and finally the low-rank adapter. If a delta is added before the backbone layer, then the last added delta is the first to be applied.
Also pay attention to the detaching order: the delta that was added first should be detached last.
:::
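A minimal sketch of that attach/detach discipline (not from the original page; it reuses the models from the example above):

```python
from transformers import AutoModelForSequenceClassification
from opendelta import AdapterModel, LowRankAdapterModel

model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base")
delta_a = AdapterModel(backbone_model=model, modified_modules=['fc2'])          # attached first
delta_b = LowRankAdapterModel(backbone_model=model, modified_modules=['fc2'],
                              reduction_factor=12)                              # attached second

# Detach in reverse order of attachment (last attached, first detached):
delta_b.detach()
delta_a.detach()
```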

@@ -0,0 +1,135 @@
# Custom Usage
Now we introduce the pipeline for migrating your full-model tuning scripts to delta tuning ones, **especially when your model is not in the default configuration list, or you don't want to use the default configuration**.
## STEP 1: Load the pretrained models
```python
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base") # suppose we load BART
```
## STEP 2: Add delta modules
We provide two alternatives to add the delta modules.
### 2.1 Visualize the backbone structure
Delta tuning's core change to the structure of the base model is to decorate (modify) its modules with small delta modules. Assume we want to treat the feedforward layer of each block as our [target modules](targetmodules). Since **different PLMs name their submodules differently**, we should first find out the name of the feedforward layer in the BART model by visualization. <img src="../imgs/hint-icon-2.jpg" height="30px"> *For more about visualization, see [Visualization](visualization).*
```python
from bigmodelvis import Visualization
Visualization(model).structure_graph()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/bart-base.png
---
width: 600px
name: bart-base
---
```
````
We can see from the structure graph that the feed forward layer in BART is called `model.encoder.layers.$.fc1` and `model.encoder.layers.$.fc2`, where `$` represents a number from 0 to 5. Since we want to apply the adapter after *all* the feed forward layers, we specify `modified_modules=['fc2']`, the common suffix of the feed forward layers.
<img src="../imgs/hint-icon-2.jpg" height="30px"> *For details about the name based addressing, see [Name-based submodule addressing](namebasedaddr)*
Other configurations, such as the `bottleneck_dim` in Adapter, can be passed as key word arguments.
```python
from opendelta import AdapterModel
delta_model = AdapterModel(backbone_model=model, modified_modules=['fc2'], bottleneck_dim=12)
delta_model.log() # This will visualize the backbone after modification and other information.
```
:::{admonition} Try different positions
:class: tip
OpenDelta provides the flexibility to add deltas at various positions in the backbone model. For example, if you want to move the adapter in the above example after the layer norm of the feed forward layer, the code should be changed into
```python
delta_model = AdapterModel(backbone_model=model, modified_modules=['final_layer_norm'], bottleneck_dim=12)
```
The performance may vary due to positional differences, but there is currently no theoretical guarantee that one position will outperform the other.
:::
:::{admonition} Favored Configurations
:class: tip
Feel confused by the flexibility that OpenDelta brings? The default configuration is the `default_modified_modules` attribute of each delta model. Generally, the default configurations are already good enough. If you want to squeeze the size of delta models further, you can refer to the following papers.
- [AdapterDrop: On the Efficiency of Adapters in Transformers](https://arxiv.org/abs/2010.11918)
- [Sparse Structure Search for Parameter-Efficient Tuning(Delta Tuning)](https://arxiv.org/abs/2206.07382)
:::
## STEP 3: Freeze parameters
So far the backbone model is still fully tunable. To freeze the main part of the backbone model except the trainable parts (usually the delta parameters), use the [freeze_module](opendelta.basemodel.DeltaBase.freeze_module) method. The syntax of the `exclude` field also obeys the [name-based addressing](namebasedaddr) rules.
```python
delta_model.freeze_module(exclude=["deltas", "layernorm_embedding"])
delta_model.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/afterfreeze.png
---
width: 600px
name: afterfreeze
---
```
````
Usually we want to save only the trainable part; then we should modify the `state_dict` of the backbone model, which originally contains all the parameters. With `set_state_dict=True`, `model.state_dict()` contains only the trainable parameters.
```python
delta_model.freeze_module(exclude=["deltas", "layernorm_embedding"], set_state_dict=True)
```
## STEP 4: Normal training pipeline
The **model** can then be trained with traditional training scripts. Two things should be noted:
:::{admonition} Note
:class: note
1. There is no need to change the optimizer, since the optimizer only computes and stores gradients for parameters with `requires_grad=True`, and the `requires_grad` attribute has already been changed during the call to the [freeze_module](opendelta.basemodel.DeltaBase.freeze_module) method.
2. `model.eval()` or `model.train()` should still be used to enable/disable dropout. OpenDelta doesn't touch those settings.
:::
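A minimal sketch of such an unchanged training step (not from the original page; the dataloader and the AdamW choice are assumptions):

```python
import torch
from torch.optim import AdamW

# The optimizer is set up exactly as in full-model tuning; frozen parameters
# simply receive no gradients and are skipped by the optimizer.
optimizer = AdamW(model.parameters(), lr=3e-4)

model.train()
for batch in dataloader:  # assumed to yield dicts of tokenized tensors including labels
    outputs = model(**batch)
    outputs.loss.backward()   # gradients flow only into requires_grad=True parameters
    optimizer.step()
    optimizer.zero_grad()
```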
## STEP 5: Save and load the Delta Model
### Option 1: Use the OpenDelta interface.
One option is to use our provided interface. This saves both the configuration of the delta model and all trainable parameters.
```python
delta_model.save_finetuned("some_local_path/")
```
When loading the delta model, just call the `from_finetuned` method. Note that the loaded model is fully trainable. If you want to continue training it, please use `freeze_module` again.
```python
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base")
from opendelta import AutoDeltaModel
delta_model = AutoDeltaModel.from_finetuned("some_local_path/", backbone_model=model)
```
### Option 2: Use the PyTorch interface.
Another option is to save and load the model in the traditional PyTorch way.
```python
import torch

torch.save(model.state_dict(), "some_local_path/pytorch_model.bin")
```
Then load it into an initialized backbone model with the delta model attached. Remember to use `strict=False`, since the `state_dict` now contains only the trainable parameters.
```python
import torch
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base")
from opendelta import AdapterModel
delta_model = AdapterModel(backbone_model=model, modified_modules=['fc2'], bottleneck_dim=12)
model.load_state_dict(torch.load("some_local_path/pytorch_model.bin"), strict=False)
```
### Option 3: Save and upload to DeltaCenter.
You can also save the delta model to DeltaCenter to share it with the community. See the [instructions](deltacenter).

@@ -0,0 +1,35 @@
# DeltaCenter
## Share to Delta Center.
```python
delta_model.save_finetuned("test_delta_model", push_to_dc = True)
```
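For context, a hedged sketch of where a `delta_model` like the one above might come from; the backbone, delta type, and checkpoint name are placeholders:

```python
from transformers import AutoModelForSeq2SeqLM
from opendelta import AdapterModel

t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
delta_model = AdapterModel(backbone_model=t5)  # ...then train it as usual...

# Save locally and push only the delta (not the backbone) to DeltaCenter:
delta_model.save_finetuned("test_delta_model", push_to_dc=True)
```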
## Download from Delta Center.
```python
# use transformers as usual.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
t5_tokenizer = AutoTokenizer.from_pretrained("t5-large")
# A running example
inputs_ids = t5_tokenizer.encode("Is Harry Poter wrtten by JKrowling", return_tensors="pt")
t5_tokenizer.decode(t5.generate(inputs_ids)[0])
# >>> '<pad><extra_id_0>? Is it Harry Potter?</s>'
```
Load delta model from delta center:
```python
# use existing delta models
from opendelta import AutoDeltaModel, AutoDeltaConfig
# use existing delta models from DeltaCenter
delta = AutoDeltaModel.from_finetuned("thunlp/Spelling_Correction_T5_LRAdapter_demo", backbone_model=t5)
# freeze the whole backbone model except the delta models.
delta.freeze_module()
# visualize the change
delta.log()
t5_tokenizer.decode(t5.generate(inputs_ids)[0])
# >>> <pad> Is Harry Potter written by JK Rowling?</s>
```

@@ -0,0 +1,16 @@
# Examples
## examples_prompt
| | LoRA | Bias<br>Tuning | Adapter<br>Houlsby | Adapter<br>Pfeiffer | Adapter<br>Drop | Adapter<br>Low-Rank | Compacter | Prefix<br>Tuning | Prompt<br>Tuning |
| --------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ----- | ----- |
| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |
| BART | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |
| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |
| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | |
| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| T5-3b(parallel)| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Deberta-v2 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | |
| CTRL | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | |
## tutorials

@@ -0,0 +1,14 @@
# FAQs
1. **Why do I encounter NotImplementedError in Prefix Tuning?**
This is because we have found no easy way to get a unified Prefix Tuning implementation for different attention classes. If you really want to use Prefix Tuning for a model we have not supported, you can implement the ``PrefixLayerYOURMODEL`` on your own or raise an issue to request the feature for your model.
2. **Available Models with default configurations are ..., Please manually add the delta models by specifying 'modified_modules' based on the visualization of your model structure**
Although most pre-trained models (PTMs) use the transformers architecture, they are implemented differently. For example, the attention modules in GPT2 and BERT are not only named differently, but also implemented differently. Common structure mapping maps the different name conventions of different PTMs into a unified name convention. But there are many PTMs that we do not currently cover. Don't worry! For these models, you can figure out which modules you should modify by simply [visualizing the PTMs](visualization), and then specify the `modified_modules` manually (see [name-based addressing](namebasedaddr)).
3. **Requires a dummy_inputs to be passed through the model to understand the dimensionality of each tensor in the computation graph. The {module.__class__.__name__} Class has no dummy_inputs, and automatically created dummy_inputs failed.**
The `dummy_inputs` can be any data that makes `backbone_model.forward(**dummy_inputs)` succeed. Only the form and shape of the `dummy_inputs` matter. To set `dummy_inputs` for your model, use `setattr(backbone_model, 'dummy_inputs', some_dummy_inputs)` before initializing `{self.__class__.__name__}`. A sketch follows.
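A minimal sketch of setting `dummy_inputs` (the backbone choice and tensor values are hypothetical; only the form and shape matter):

```python
import torch
from transformers import AutoModel

backbone_model = AutoModel.from_pretrained("bert-base-uncased")

# Any input that makes backbone_model.forward(**dummy_inputs) succeed works.
some_dummy_inputs = {"input_ids": torch.tensor([[0, 0, 0]])}
setattr(backbone_model, "dummy_inputs", some_dummy_inputs)
# ...now initialize the delta model as usual.
```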

@@ -0,0 +1,129 @@
(visualization)=
# Visualize the Parameters
When OpenDelta makes modifications to a pretrained model (PTM), it is beneficial to know what your PTM looks like, especially the locations of the parameters.
- **Before** applying OpenDelta, you can learn **how to specify your modifications in terms of key addressing**.
- **After** the modification is done, you can check **whether your modification is what you expected**, for example, whether the positions of the delta modules are as desired, and whether you froze the correct parameters.
Now let's begin to try the visualization utility.
## Visualization is NOT easy using PyTorch's native functions.
```python
from transformers import BertForMaskedLM
backbone_model = BertForMaskedLM.from_pretrained("bert-base-uncased")
print(backbone_model)
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/raw_print.png
---
width: 600px
name: raw_print
---
```
````
The original presentation of models is **not tailored to repeated structures, big models, or parameter-centric tasks**.
## Using visualization from bigmodelvis.
First let's visualize all the parameters in the BERT model. As we can see, the structure inside a BERT model and the locations of all its parameters are neatly represented in a tree structure. (See the [color scheme](color_schema) for the colors.)
```python
from bigmodelvis import Visualization
model_vis = Visualization(backbone_model)
model_vis.structure_graph()
```
<!-- ````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span> -->
```{figure} ../imgs/bert_vis.png
---
width: 600px
name: bert_vis
---
```
<!-- ```` -->
<div class="admonition note">
<p class="title">**Suggestion**</p>
We can reference a module according to the graph easily:
```python
print(backbone_model.bert.encoder.layer[0].intermediate)
```
When using OpenDelta on a new backbone model, it's best to first visualize the child module names (shown in white), and then designate the `modified_modules`.
</div>
## Now add a delta model and visualize the change.
```python
from opendelta import LowRankAdapterModel
delta_model = LowRankAdapterModel(backbone_model)
delta_model.freeze_module(exclude=["cls", "intermediate", "LayerNorm"])
Visualization(backbone_model).structure_graph()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/bertdelta_vis.png
---
width: 600px
name: bertdelta_vis
---
```
````
(color_schema)=
<div class="admonition tip">
<div class="title">**Color Schema**</div>
<ul>
<li> The <span style="font-weight:bold;color:white;">white</span> part is the name of the module.</li>
<li> The <span style="font-weight:bold;color:green;">green</span> part is the module's type.</li>
<li> The <span style="font-weight:bold;color:blue;">blue</span> part is the tunable parameters, i.e., the parameters that require grad computation.</li>
<li> The <span style="font-weight:bold;color:grey;">grey</span> part is the frozen parameters, i.e., the parameters that do not require grad computation.</li>
<li> The <span style="font-weight:bold;color:red;">red</span> part is the structure that is repeated and thus folded.</li>
<li> The <span style="font-weight:bold;color:purple;">purple</span> part is the delta parameters inserted into the backbone model.</li>
</ul>
</div>
:::{admonition} Platform Sensitivity
:class: warning
Depending on the platform the code is running on, the colors may vary slightly.
:::
## We also provide the option to visualize the nodes without parameters.
```python
Visualization(backbone_model).structure_graph(keep_non_params=True)
```
Thus, modules like dropout and activations are kept.
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/bertdelta_noparam.png
---
width: 600px
name: bertdelta_noparam
---
```
````
:::{admonition} Order of the submodule
:class: warning
Currently, OpenDelta's visualization is based on PyTorch's `named_modules` method. That means the submodules are presented in the order they were added to the parent module, not necessarily the order in which tensors flow through them.
:::
# Inspect the optimizer

@@ -0,0 +1,31 @@
(installation)=
# Installation
The latest version of OpenDelta is tested on [Python 3.8](https://www.python.org/) and [PyTorch 1.12](https://pytorch.org/). Other versions are likely to be supported as well.
## Install the latest version
```bash
pip install git+https://github.com/thunlp/OpenDelta.git
```
## Install the latest pip version (more stable)
```bash
pip install opendelta
```
## Build from source
```bash
git clone git@github.com:thunlp/OpenDelta.git
cd OpenDelta
```
then
```
python setup.py install
```
or, if you want to modify the code for your research:
```
python setup.py develop
```

@@ -0,0 +1,68 @@
(keyfeature)=
# Philosophy and Key Features
:::{admonition} Plug-and-play Design.
:class: tip
Existing open-source projects that propagate this **''delta-tuning''** paradigm include
<a href="https://adapterhub.ml">AdapterHub</a>, which copies the transformers code base and modifies it, making it unintuitive to migrate from a normal code base to a delta-tuning one.
OpenDelta approaches this problem in a **true plug-and-play** fashion for the PLMs. To migrate from a full-model fine-tuning training script to a delta-tuning training script, you **DO NOT** need to change the backbone model code base to an adapted code base.
:::
Here is how we achieve it.
<img src="../imgs/pointing-right-finger.png" height="30px"> **Reading through this will also help you implement your own delta models in a sustainable way.**
## 1. Name-based submodule addressing.
See [name based addressing](namebasedaddr)
## 2. Three basic submodule-level delta operations.
We use three key functions to achieve the modifications to the backbone model outside the backbone model's code.
1. **unfreeze some parameters**

   Some delta models will unfreeze a part of the model parameters and freeze the other parts, e.g., [BitFit](https://arxiv.org/abs/2106.10199). For these methods, just use the [freeze_module](opendelta.basemodel.DeltaBase.freeze_module) method and pass the delta parts into `exclude` (see the sketch after this list).
2. **replace a module**

   Some delta models will replace a part of the model with a delta model, i.e., the hidden states will no longer go through the original submodules. This includes [Lora](https://arxiv.org/abs/2106.09685).
   For these methods, we have a [replace_module](opendelta.basemodel.DeltaBase.replace_module) interface.
3. **insertion to the backbone**
- **sequential insertion**
   Most adapter models insert a new adapter layer after/before the original transformer blocks. For these methods, insert the adapter's forward function after/before the original layer's forward function using the [insert_sequential_module](opendelta.basemodel.DeltaBase.insert_sequential_module) interface.
- **parallel insertion**
Adapters can also be used in a parallel fashion (see [Paper](https://arxiv.org/abs/2110.04366)).
For these methods, use [insert_parallel_module](opendelta.basemodel.DeltaBase.insert_parallel_module) interface.
:::{admonition} Doc-preserving Insertion
:class: note
In the insertion operations, the replaced forward function will inherit the doc strings of the original functions.
:::
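As a concrete illustration of operation 1, here is a minimal sketch (the backbone choice and the `exclude` list are illustrative):
```python
from transformers import AutoModelForSequenceClassification
from opendelta import BitFitModel

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased")
delta_model = BitFitModel(model)
# keep the bias deltas and the classification head tunable; freeze everything else
delta_model.freeze_module(exclude=["deltas", "classifier"])
```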
## 3. Pseudo input to initialize.
Some delta models, especially the ones that are newly introduced into the backbone, need to determine their parameters' shapes. To get the shapes, we pass a pseudo input to the backbone model and determine the shape of each delta layer according to the need of a smooth tensor flow.
:::{admonition} Pseudo Input
:class: warning
Most models in [Huggingface Transformers](https://huggingface.co/docs/transformers/index) have an attribute [dummy_inputs](https://github.com/huggingface/transformers/blob/v4.16.2/src/transformers/modeling_utils.py#L464). This creates a nonsensical input in the correct format to pass into the model's forward function.
For models that don't inherit/implement this attribute, we assume the pseudo input to the model is something like `input_id`, i.e., an integer tensor.
```python
pseudo_input = torch.tensor([[0,0,0]])
# or
pseudo_input = torch.tensor([0,0,0])
```
<img src="../imgs/todo-icon.jpeg" height="30px"> We will add interface to allow more pseudo input in the future.
:::

View File

@ -0,0 +1,185 @@
# Name-based Addressing
Name-based addressing is what sets OpenDelta apart from other packages and makes it applicable to a broader range of models (even emerging ones).
## Name of a submodule.
We locate the submodules that we want to apply a delta layer via name-based addressing.
In PyTorch fashion, a submodule can be accessed from a root model via 'dot' addressing. For example, we define a toy language model:
```python
import torch.nn as nn
class MyNet1(nn.Module):
def __init__(self,):
super().__init__()
self.name_a = nn.Linear(5,5)
def forward(self, hiddens):
return self.name_a(hiddens)
class MyNet2(nn.Module):
def __init__(self,):
super().__init__()
self.embedding = nn.Embedding(10,5)
self.name_b = nn.Sequential(MyNet1(), MyNet1())
def forward(self, input_ids):
hiddens = self.embedding(input_ids)
return self.name_b(hiddens)
root = MyNet2()
print(root.name_b[0].name_a)
# Linear(in_features=5, out_features=5, bias=True)
```
We can visualize the model (For details, see [visualization](visualization))
```python
from bigmodelvis import Visualization
Visualization(root).structure_graph()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/name_based_addressing.png
---
width: 500px
name: name_based_addressing
---
```
````
In this case, string `"name_b.0.name_a"` will be the name to address the submodule from the root model.
Thus, when applying a delta model to this toy net:
```python
from opendelta import AdapterModel
AdapterModel(backbone_model=root, modified_modules=['name_b.0.name_a'])
Visualization(root).structure_graph()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/toy-delta.png
---
width: 500px
name: toy-delta
---
```
````
(targetmodules)=
## Target modules.
For different delta methods, the operation applied to the modification target differs.
- Adapter based method: Insert at the target module's forward function.
- BitFit: Add bias to all allowed positions of the target module.
- Lora: Substitute all the linear layers of the target module with [Lora.Linear](https://github.com/microsoft/LoRA/blob/main/loralib/layers.py#L92).
- Prefix Tuning: the target module must be an attention module.
:::{admonition} Auto Searching
:class: note
We are working on unified operations that automatically search within a given module for the submodules to which a specific delta method can be applied.
:::
## Making addressing easier.
Handcrafting the full names of submodules can be frustrating, so we made some simplifications:
1. **End-matching** Rules.
OpenDelta will take every module that
**ends with** the provided name suffix as a modification [target module](targetmodules).
:::{admonition} Example
:class: tip
Taking DistilBERT with a classifier on top as an example:
- setting `["0.attention.out_lin"]` will add delta modules to the attention output of DistilBERT's
layer 0, i.e., `distilbert.transformer.layer.0.attention.out_lin`.
- setting `["attention.out_lin"]` will add the delta modules in every layer's `attention.out_lin`.
:::
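In code, the second rule above would look like this minimal sketch (the backbone choice is illustrative):
```python
from transformers import AutoModel
from opendelta import AdapterModel

model = AutoModel.from_pretrained("distilbert-base-uncased")
# every module whose name ends with "attention.out_lin" becomes a target
delta_model = AdapterModel(backbone_model=model, modified_modules=["attention.out_lin"])
```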
(regexexpr)=
2. Regular Expression.
We also support regex end-matching rules.
We use a beginning `[r]` followed by a regular expression to represent this rule, where `[r]` is used to distinguish it from normal string matching rules and has no other meanings.
Taking RoBERTa with a classifier on top as an example: it has two modules named `roberta.encoder.layer.0.attention.output.dense` and `roberta.encoder.layer.0.output.dense`, which both end with `output.dense`. To distinguish them:
- set `'[r](\d)+\.output.dense'` using the regex rule, where `(\d)+` matches any layer number. This rule matches all `roberta.encoder.layer.$.output.dense`, where `$` stands for any integer; in a 12-layer RoBERTa, that's 0-11.
- set `'[r][0-5]\.attention'` to match only layers 0-5's attention submodules.
- set `'attention.output.dense'` using the ordinary rule, which matches `roberta.encoder.layer.$.attention.output.dense` but not `roberta.encoder.layer.$.output.dense`.
:::{admonition} Regex in Json Configs
:class: warning
In json, you should write `"\\."` instead of `"\."` for a real dot due to json parsing rules. That is
```
{
...
"modified_moduls": ['[r][0-5]\\.attention'],
...
}
```
:::
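As a hedged sketch of the first regex rule in Python (the backbone and delta method are illustrative):
```python
from transformers import AutoModelForSequenceClassification
from opendelta import LoraModel

model = AutoModelForSequenceClassification.from_pretrained("roberta-base")
# '[r]' marks a regex rule; this matches every layer's output.dense
delta_model = LoraModel(backbone_model=model,
                        modified_modules=[r'[r](\d)+\.output.dense'])
```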
3. Interactive Selection.
We provide a visual, interactive way to select the needed modules.
```python
from transformers import BertForMaskedLM
model = BertForMaskedLM.from_pretrained("bert-base-cased")
# suppose we load BERT
from opendelta import LoraModel # use lora as an example, others are same
delta_model = LoraModel(backbone_model=model, interactive_modify=True)
```
By setting `interactive_modify`, a web server will be opened on localhost, and the link will be printed in the terminal, e.g.,
```
http://0.0.0.0:8888/
```
If on your local machine, click the link to open it for interactive modification.
If on a remote host, you can use port mapping. For example, the VS Code terminal automatically does port mapping for you; simply use `control/command + click` to open the link.
If the default port is occupied by another program, you can change it by setting `interactive_modify=port_number`, where `port_number` is an integer.
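For example, a minimal sketch with a custom port (the backbone and port number are illustrative):
```python
from transformers import BertForMaskedLM
from opendelta import LoraModel

model = BertForMaskedLM.from_pretrained("bert-base-cased")
# serve the interactive selection page on port 8899 instead of the default
delta_model = LoraModel(backbone_model=model, interactive_modify=8899)
```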
The web page looks like the following figure.
```{figure} ../imgs/interact.jpg
---
width: 500px
name: interact web page
---
```
- Click `[+]`/`[-]` to expand/collapse tree nodes.
- Click on text to select tree nodes; a **yellow dotted** box indicates the selection.
- **Double-clicking** the pink `[*]` is an advanced option to unfold repeated nodes. By default, modules with the same architecture are folded into one node and marked in red; for example, the `BertLayer` of layers 0~11 in the above figure share the same structure. Regular modifications will make the same change to each layer.
- If you want to change only a few of them, first double-click on `[*]`, then select the parts you want in the unfolded structure.
- If you want to make the same change to all but a few of them, first select the common parts you want in the folded structure, then double-click on `[*]` to remove the few positions you don't need to change in the expanded structure.
Click the `submit` button in the top-right corner, then go back to your terminal. You will get a list of name-based addresses printed in the following format; these are the modules to which the deltas are applied.
```
modified_modules:
[bert.encoder.layer.0.output.dense, ..., bert.encoder.layer.11.output.dense]
```
## Examples
Nothing works better than a few lively examples.
Coming Soon...

View File

@ -0,0 +1,36 @@
# What is Delta-tuning and Why OpenDelta?
(WhatisDelta)=
:::{admonition} What is Delta?
:class: tip
As Pre-trained language models (PLMs) have become the fundamental infrastructure on many NLP tasks and benchmarks, it is becoming increasingly clear from recent research that **larger models tend to lead to better performance**. However, large-scale PLMs also bring prohibitive adaptation costs when fine-tuning all the parameters of a model and retaining separate instances for different tasks.
**Parameter-efficient model stimulation methods** have thus attracted researchers' attention: they tune only a small fraction of the model parameters while achieving comparable or even better performance than full-model fine-tuning, and are dubbed "Delta-tuning".
**Delta** thus means a small fraction $\Delta\Theta$ of parameters besides the pretrained model $\Theta_0$.
\begin{gather*}
\Theta \sim \Theta_0\text{(frozen)} + \Delta\Theta\text{(tunable)}
\end{gather*}
This open-source project implements several delta-tuning methods, which allows researchers and engineers to quickly migrate their code from full-model tuning to delta-tuning without replacing the backend (the implementation of the backbone PLM).
:::
## Why OpenDelta?
- <span style="color:rgb(81, 217, 245);font-weight:bold">Clean:</span> No need to edit the backbone PTMs codes.
- <span style="color:orange;font-weight:bold">Simple:</span> Migrating from full-model tuning to delta-tuning needs as little as 3 lines of codes.
- <span style="color:green;font-weight:bold">Sustainable:</span> Most evolution in external library doesnt require a new OpenDelta.
- <span style="color:red;font-weight:bold">Extendable:</span> Various PTMs can share the same delta-tuning codes.
- <span style="color:purple;font-weight:bold">Flexible:</span> Able to apply delta-tuning to (almost) any position of the PTMs.
## Delta-tuning papers
<img src="../imgs/todo-icon.jpeg" height="30px">

View File

@ -0,0 +1,113 @@
# Multitask Modeling using OpenDelta
:::{admonition} Multitask Serving with Delta-tuning
:class: tip
A huge advantage of delta-tuning is that it can be used for multitask serving.
Imagine we have a pretrained model trained on a mix of data from multiple languages, e.g., English, Chinese, and French. Now you want separate models that specialize in Chinese, French, and English. We can delta-tune three deltas, one per language, each with a small amount of additional language-specific data. During serving, when a Chinese sentence comes, you attach the "Chinese Delta"; when a French sentence comes next, you detach the "Chinese Delta" and attach the "French Delta".
:::
**Here is how to achieve multitask serving using OpenDelta.**
```python
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base")
from opendelta import LoraModel
delta_model = LoraModel(backbone_model=model, modified_modules=['fc2'])
delta_model.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/plugunplug1.png
---
width: 800px
name: plugunplug1
---
```
````
Now we detach the deltas from the backbone
```python
delta_model.detach()
delta_model.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/plugunplug2.png
---
width: 800px
name: plugunplug2
---
```
````
We can reattach the deltas to the backbone
```python
delta_model.attach()
delta_model.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/plugunplug3.png
---
width: 800px
name: plugunplug3
---
```
````
:::{admonition} Independence of Different Delta Models
:class: note
Different delta models will be independent in detaching and attaching.
(But the visualization will not show all deltas in the backbone model.)
```python
# continue from the above example
from opendelta import AdapterModel
delta_model2 = AdapterModel(backbone_model=model, modified_modules=['fc1'])
delta_model2.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/plugunplug4.png
---
width: 800px
name: plugunplug4
---
```
````
Detach the LoRA delta:
```python
delta_model.detach() # detach the lora delta
delta_model.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/plugunplug5.png
---
width: 800px
name: plugunplug5
---
```
````
Detach the adapter delta and reattach the LoRA delta:
```python
delta_model2.detach() # detach the adapter delta
delta_model.attach() # reattach the lora delta
delta_model.log()
```
````{collapse} <span style="color:rgb(141, 99, 224);font-weight:bold;font-style:italic">Click to view output</span>
```{figure} ../imgs/plugunplug6.png
---
width: 800px
name: plugunplug6
---
```
````
:::
:::{admonition} BitFit not supported
:class: warning
<img src="../imgs/todo-icon.jpeg" height="30px"> Currently detach is not suitable for BitFit, which modify the requires_grad property. Please wait for future releases.
:::

View File

@ -0,0 +1,38 @@
(basics)=
# Quick Start
Now we introduce the most basic interface to migrate your full-model tuning scripts to delta-tuning ones **on some commonly used PTMs or their derivative models** (models that have the PTM as a submodule, e.g., `BertForSequenceClassification`). [Try it in Colab](https://colab.research.google.com/drive/1SB6W5B-2nKxOnkwHSIe3oGXZ7m53u_Vf?usp=sharing)
```diff
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("bert-large-cased")
+ from opendelta import AdapterModel
+ delta_model = AdapterModel(model)
+ delta_model.freeze_module(exclude=["deltas", "classifier"]) # leave the delta tuning modules and the newly initialized classification head tunable.
+ # delta_model.log() # optional: to visualize how the `model` changes.
training_dataloader = get_dataloader()
optimizer, loss_function = get_optimizer_loss_function()
for batch in training_dataloader:
optimizer.zero_grad()
targets = batch.pop('labels')
outputs = model(**batch).logits
loss = loss_function(outputs, targets)
loss.backward()
optimizer.step()
print(loss)
- torch.save(model.state_dict(), "finetuned_bert.ckpt")
+ delta_model.save_finetuned("finetuned_bert")
```
We currently support the following models and their derivative models in their default configurations.
- BERT
- DeBERTa-v2
- GPT2
- OPT
- RoBERTa
- T5
For models not in the above list, please refer to the more detailed [custom usage](custom).
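To later load the delta saved above onto a fresh backbone, a hedged sketch (assuming `AutoDeltaModel.from_finetuned`; the exact API may vary across versions):
```python
from transformers import AutoModelForSequenceClassification
from opendelta import AutoDeltaModel

backbone = AutoModelForSequenceClassification.from_pretrained("bert-large-cased")
# re-attach the delta saved by delta_model.save_finetuned("finetuned_bert")
delta_model = AutoDeltaModel.from_finetuned("finetuned_bert", backbone_model=backbone)
```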

View File

@ -0,0 +1,82 @@
(commonstructure)=
# Common Structure Mapping
```{figure} ../imgs/transformers_structure.png
:width: 400px
:name: transformers_structure
```
Although different PTMs often share similar Transformers structures, the codebases, and most importantly, the variable names for each submodule, are quite different.
On the one hand, we **encourage the users to first [visualize](visualization) the PTM's structure and then determine the names of the submodules.**
On the other hand, we designed a unified naming convention for the Transformer structure and provide several structure mappings from the original names to the unified convention.
In this section, we will illustrate the unified name convention and structure mapping.
## Common blocks in Transformers structure.
- embeddings (word embedding)
- encoder
- block
- $ (layer_id)
- attn
- q, k, v
- proj
- layer_norm
- ff
- w1
- w2
- layer_norm
- decoder (similar to encoder)
- lm_head
- proj
Visualizing bert-base using the common structure names: the submodules that are not common are grey.
```{figure} ../imgs/commonstructure_vis.png
:width: 600px
:name: commonstructure_vis
```
(mappingexample)=
## Example
Example of bert mapping: a tree with node names specified by <span style="font-weight:bold;color:rgb(55, 125, 34);" >"\_\_name\_\_"</span>
```json
{
"bert.embeddings.word_embeddings": {"__name__":"embeddings"},
"bert.embeddings.position_embeddings": {"__name__":""},
"bert.embeddings.token_type_embeddings": {"__name__":""},
"bert.embeddings.LayerNorm": {"__name__":""},
"bert.encoder": {"__name__":"encoder",
"layer": {"__name__":"block",
"$": {"__name__":"$",
"attention": {"__name__":"attn",
"self.query": {"__name__":"q"},
"self.key": {"__name__":"k"},
"self.value": {"__name__":"v"},
"output.dense": {"__name__":"proj"},
"output.LayerNorm": {"__name__":"layer_norm"},
},
"output": {"__name__":"ff",
"dense": {"__name__":"w2"},
"LayerNorm": {"__name__":"layer_norm"}
},
"intermediate.dense": {"__name__":"ff.w1"},
}
}
},
"cls.predictions": {"__name__": "lm_head",
"transform.dense": {"__name__":""},
"transform.LayerNorm": {"__name__":""},
"decoder": {"__name__":"proj"},
}
}
```
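With such a mapping, unified names can be used as `modified_modules`. A hedged sketch (assuming the `common_structure` flag; check your installed version's API):
```python
from transformers import BertForMaskedLM
from opendelta import LoraModel

model = BertForMaskedLM.from_pretrained("bert-base-cased")
# with the mapping above, 'attn.q' resolves to bert.encoder.layer.$.attention.self.query
delta_model = LoraModel(backbone_model=model, modified_modules=['attn.q'],
                        common_structure=True)
```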

View File

@ -0,0 +1,35 @@
# Update Logs and Known Issues
## Version 0.3.2
- We improve the docs.
- We support BMTrain to accelerate the training, and parallelize the training of models that are hard to fit in a single GPU. Check [tutorial/2_with_bmtrain.py](https://github.com/thunlp/OpenDelta/tree/main/examples/tutorial/2_with_bmtrain.py)
- We add a functionality to [inspect the optimizer](https://github.com/thunlp/OpenDelta/tree/main/opendelta/utils/inspect.py). The user can see the number of trainable parameters in the optimizer and verify that opendelta is being used correctly.
- We move the functions that inspect the delta models into [inspect.py](https://github.com/thunlp/OpenDelta/tree/main/opendelta/utils/inspect.py).
## Version 0.3.1
- We update [must_try.py](https://github.com/thunlp/OpenDelta/tree/main/examples/unittest/must_try.py) for a simple introduction of the core functionality of OpenDelta.
- Thanks to [Weilin Zhao](https://github.com/Achazwl), we merge a long-developed branch `parallel_adapter` into the main branch.
## Version 0.3.0
### Updates:
- Add this changelog for a granular record of updates.
- The default configuration of delta models can be applied to more wrapped models.
- There is less need to configure 'modified_modules' for wrapped models like [BertForSequenceClassification](https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertForSequenceClassification) or even [OpenMatch.DRModel](https://github.com/OpenMatch/OpenMatch/blob/master/src/openmatch/modeling/dense_retrieval_model.py#L37), as long as they contain a model for which we support a default configuration. **Note that if you customize `modified_modules` by yourself, most pytorch models are supported.**
- LoRA and BitFit models now do not need pseudo data to instantiate the model.
- BitFit models can now support [Conv1D](https://huggingface.co/docs/transformers/v4.23.1/en/internal/modeling_utils#transformers.Conv1D) using default configuration.
- Improve type hint for AutoDeltaModel.
- Fix bugs in documentation.
- Fix small bugs when saving a model without a config attributes.
- Make the default modified modules of adapter-like methods more accurate: attach the adapter-like modules after the output of the attention layer and the second feed-forward layer, both before the layernorm layers.
- A simple unit test folder containing development-time tests has been added for interested users.
### Known Issues
- SoftPrompt is still not supported for wrapped models if the model has no attribute `get_input_embeddings`.
- Prefix Tuning is still limited to T5, GPT2, Bart, Bert, Roberta.
## Version 0.2.4
### Updates
- examples/examples_seq2seq and examples/examples_text-classification are deprecated and moved to [legacy](https://github.com/thunlp/OpenDelta/tree/main/examples/legacies).
- Thanks to [Zhen Zhang](https://github.com/namezhenzhang), we provide [examples_prompt](https://github.com/thunlp/OpenDelta/tree/main/examples/examples_prompt), as a cleaner and more general framework, which unifies the delta tuning paradigm and the prompt-tuning paradigm. It is still based on [Huggingface Trainers](https://huggingface.co/docs/transformers/main_classes/trainer). In this example framework, the running pipeline is [a unified script](https://github.com/thunlp/OpenDelta/tree/main/examples/examples_prompt/src), the differences in tasks, models, delta tuning models, and even prompt-tuning paradigms are [more modular and be more independent ](https://github.com/thunlp/OpenDelta/tree/main/examples/examples_prompt/backbones). Please try it out!

View File

@ -0,0 +1,3 @@
# OpenDelta + Huggingface Accelerate
<img src="../imgs/todo-icon.jpeg" height="30px">

View File

@ -0,0 +1,12 @@
(acceleration)=
# OpenDelta + BMTrain
- [BMTrain](https://github.com/OpenBMB/BMTrain) is an efficient large model training toolkit that can be used to train large models with tens of billions of parameters. It can train models in a distributed manner while keeping the code as simple as stand-alone training.
- [ModelCenter](https://github.com/OpenBMB/ModelCenter) implements pre-trained language models (PLMs) based on the backend OpenBMB/BMTrain. ModelCenter supports Efficient, Low-Resource, Extendable model usage and distributed training.
Now LoraModel, AdapterModel, CompacterModel, ParallelAdapterModel, and LowRankAdapterModel fully support distributed training with BMTrain and ModelCenter.
Pass `backend='bmt'` in the config or at delta model initialization to enable BMTrain.
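A minimal sketch of the second option (here `model` is assumed to be a ModelCenter backbone built elsewhere):
```python
from opendelta import LoraModel

# `model` is a ModelCenter PLM built elsewhere (see the ModelCenter docs)
delta_model = LoraModel(backbone_model=model, backend='bmt')  # enable the BMTrain backend
```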

View File

@ -0,0 +1,25 @@
# Use Examples
This repo mainly contains several running scripts to use OpenDelta to conduct parameter-efficient training of various tasks.
**Note that we suggest adding OpenDelta to existing scripts, instead of modifying a script to match the following examples. OpenDelta itself doesn't restrict the training pipeline, nor does it provide one.**
## tutorial
Several toy tutorials:
1. The scripts for docs/basic_usage
2. Using interactive module selection
3. Work with [OpenPrompt](https://github.com/thunlp/OpenPrompt)
## examples_text-classification
Modify a huggingface text-classification examples into a delta tuning one.
Currently, GLUE datasets are supported in the scripts. Roberta-base is used for performance checking. Read README.md inside the repo for detailed usage.
## examples_seq2seq
Modify a huggingface sequence to sequence examples into a delta tuning one.
Currently, SuperGLUE and GLUE datasets are supported in the scripts. T5-base is used for performance checking. Read README.md inside the repo for detailed usage.
## examples_image-classification
A toy example of using OpenDelta for a Computer Vision pretrained model (ViT). Since ViT is an experimental feature in huggingface transformers, this example is subject to change at any moment.

View File

@ -0,0 +1,59 @@
# Examples of using opendelta together with 🤗 transformers.
In this repo, we construct a very general pipeline to train and test a PLM using
🤗 transformers.
The pipeline was constructed together with [openpromptu](https://pypi.org/project/openpromptu/), which is a light and
model-agnostic version of [openprompt](https://github.com/thunlp/OpenPrompt).
## Pool of PLMs
We are going to adapt most of the models in 🤗 transformers
in this repo. The different pipelines, processing steps, and configurations are specified
in `./backbones/`. You can add your own model there to support customized models.
### An example script to run the repo in offline mode
```bash
conda activate [YOURENV]
PATHBASE=[YOURPATH]
JOBNAME="adapter_t5-base"
DATASET="superglue-cb"
cd $PATHBASE/OpenDelta/examples/examples_prompt/
python configs/gen_t5.py --job $JOBNAME
export TRANSFORMERS_OFFLINE=1
export HF_DATASETS_OFFLINE=1
python src/run.py configs/$JOBNAME/$DATASET.json \
--model_name_or_path [YOURPATH_TO_T5_BASE] \
--tokenizer_name [YOURPATH_TO_T5_BASE] \
--datasets_saved_path [YOURPATH_TO_CB_DATASETS] \
--finetuned_delta_path ${PATHBASE}/delta_checkpoints/ \
--num_train_epochs 20 \
--bottleneck_dim 24 \
--delay_push True
```
## An example of quickly testing the repo.
```bash
conda activate [YOURENV]
PATHBASE=[YOURPATH]
JOBNAME="adapter_t5-base"
DATASET="superglue-cb"
cd $PATHBASE/OpenDelta/examples/examples_prompt/
export TRANSFORMERS_OFFLINE=1
export HF_DATASETS_OFFLINE=1
export DELTACENTER_OFFLINE=0
python src/test.py configs/$JOBNAME/$DATASET.json \
--model_name_or_path [YOURPATH_TO_T5_BASE] \
--tokenizer_name [YOURPATH_TO_T5_BASE] \
--datasets_saved_path [YOURPATH_TO_CB_DATASETS] \
--finetuned_delta_path thunlp/t5-base_adapter_superglue-cb_20220701171436c80 \
--delta_cache_dir "./delta_checkpoints/" \
--force_download True
```

View File

@ -0,0 +1,179 @@
from openpromptu.data_utils import InputExample
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
from transformers import (
AutoConfig,
AutoModelForSeq2SeqLM,
AutoTokenizer,
)
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
import torch
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.mask_token
def get_remove_columns(dataset_features):
return dataset_features
def preprocess_function(raw_example, **kwargs):
# max_target_length += 1
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
split = kwargs['split']
example = InputExample(**raw_example)
example = verbalizer.wrap_one_example(example)
example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
model_inputs = tokenizer(input_sentence, max_length=256,
padding="max_length", truncation=True)
with tokenizer.as_target_tokenizer():
label = tokenizer(other['tgt_text']).input_ids
model_inputs["labels"] = label
return model_inputs
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
# model_args.config_name if model_args.config_name else model_args.model_name_or_path,
model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
return config, tokenizer, model
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
from openpromptu.prompts import GenerationVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return template, verbalizer, tokenizer_wrapper
class Trainer(HfSeq2SeqTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.eval_task = eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
outputs = model(**inputs)
if return_outputs:
return (outputs.loss, outputs)
else:
return outputs.loss
def prediction_step(
self,
model, #nn.Module,
inputs, #Dict[str, Union[torch.Tensor, Any]],
prediction_loss_only, #: bool,
ignore_keys, #: Optional[List[str]] = None,
): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
"""
Perform an evaluation step on :obj:`model` using :obj:`inputs`.
Subclass and override to inject custom behavior.
Args:
model (:obj:`nn.Module`):
The model to evaluate.
inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
The inputs and targets of the model.
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
argument :obj:`labels`. Check your model's documentation for all accepted arguments.
prediction_loss_only (:obj:`bool`):
Whether or not to return the loss only.
Return:
Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
labels (each being optional).
"""
if not self.args.predict_with_generate or prediction_loss_only:
return super().prediction_step(
model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
)
has_labels = "labels" in inputs
inputs = self._prepare_inputs(inputs)
gen_kwargs = {
"max_length": 10, # self._max_length if s is not None else self.model.config.max_length,
"num_beams": 1 #self._num_beams if self._num_beams is not None else self.model.config.num_beams,
}
generated_tokens = self.model.generate(
inputs["input_ids"],
attention_mask=inputs["attention_mask"],
**gen_kwargs,
)
# in case the batch is shorter than max length, the output should be padded
if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
with torch.no_grad():
outputs = model(**inputs)
if has_labels:
if self.label_smoother is not None:
loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
else:
loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
else:
loss = None
if self.args.prediction_loss_only:
return (loss, None, None)
labels = inputs["labels"]
if labels.shape[-1] < gen_kwargs["max_length"]:
labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
# from IPython import embed; embed(header="In seqseqtrainer")
return (loss, generated_tokens, labels)
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds
decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
# post_processor = .get(data_args.dataset_name[0], tokenizer,
# data_args.ignore_pad_token_for_loss)
# decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info)
result = {}
for metric in self.eval_task.metric:
result.update(metric(decoded_preds, decoded_labels))
average_metric = sum(result.values())/len(result)
result.update({"average_metrics":average_metric})
return result

View File

@ -0,0 +1,140 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
import numpy as np
from transformers import (
AutoConfig,
AutoFeatureExtractor,
AutoModelForImageClassification,
)
from transformers import Trainer as HfTrainer
import torch.nn as nn
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
# from openpromptu.prompts import ManualVerbalizer
# from openpromptu.prompts import ManualTemplate
# from openpromptu import TokenizerWrapper
# template = ManualTemplate(text = task.templates_text[template_id])
# verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
# tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return None, None, None
def preprocess_function(raw_example, **kwargs):
# from IPython import embed; embed(header="Therefa")
tokenizer = kwargs['tokenizer']
# print(np.array(raw_example['img']).shape)
model_inputs = tokenizer(np.array(raw_example['image']), return_tensors='pt')
model_inputs['pixel_values'] = model_inputs['pixel_values'].squeeze()
model_inputs['labels'] = raw_example['label']
return model_inputs
def compute_metrics(eval_preds, dataset_name, eval_metric):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in eval_metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.mask_token
def get_remove_columns(dataset_features):
# dataset_features.pop("label")
# print("remove_columns: {}".format(dataset_features))
return dataset_features
class DataCollator(HfDataCollatorMixin):
def __init__(self, *args, **kwargs):
self.return_tensors='pt'
def torch_call(self, features):
# from IPython import embed; embed(header="in data collator")
a = torch_default_data_collator(features=features)
# from IPython import embed; embed(header="in data collator")
return a
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoFeatureExtractor.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForImageClassification.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.num_labels = model_args.num_classes
old_classifier = model.classifier
model.classifier = nn.Linear(old_classifier.in_features, config.num_labels)
return config, tokenizer, model
class Trainer(HfTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.verbalizer=verbalizer
self.eval_task=eval_task
self.compute_metrics = self._compute_metrics
self.loss_fn = nn.CrossEntropyLoss()
def compute_loss(self, model, inputs, return_outputs=False):
labels = inputs.pop('labels')
outputs = model(**inputs)
logits = outputs.get("logits")
loss = self.loss_fn(logits, labels)
return (loss, outputs) if return_outputs else loss
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in self.eval_task.metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
# from IPython import embed; embed(header="In compute metrics")  # debug hook disabled
return result

View File

@ -0,0 +1,142 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
import numpy as np
from transformers import (
AutoConfig,
AutoModelForMaskedLM,
AutoTokenizer,
)
from transformers import Trainer as HfTrainer
def preprocess_function(raw_example, **kwargs):
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
example = InputExample(**raw_example)
example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
padding="max_length", truncation=True)
return model_inputs
def compute_metrics(eval_preds, dataset_name, eval_metric):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in eval_metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.mask_token
def get_remove_columns(dataset_features):
dataset_features.remove("label")
return dataset_features
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
from openpromptu.prompts import ManualVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
# from IPython import embed; embed()
return template, verbalizer, tokenizer_wrapper
class DataCollator(HfDataCollatorMixin):
def __init__(self, *args, **kwargs):
self.return_tensors='pt'
def torch_call(self, features):
return torch_default_data_collator(features=features)
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForMaskedLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model.resize_token_embeddings(len(tokenizer))
return config, tokenizer, model
class Trainer(HfTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.verbalizer=verbalizer
self.eval_task=eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
labels = inputs.pop('labels')
outputs = model(**inputs)
logits = outputs.get("logits")
input_ids = inputs['input_ids']
verbalizer = self.verbalizer.cuda()
logits_at_mask = logits[torch.where(input_ids == verbalizer.tokenizer.mask_token_id)]
label_logits = verbalizer.process_logits(logits_at_mask)
loss_fct = torch.nn.CrossEntropyLoss()
loss = loss_fct(label_logits, labels)
outputs.logits = label_logits
return (loss, outputs) if return_outputs else loss
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in self.eval_task.metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result

View File

@ -0,0 +1,143 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
import numpy as np
from transformers import (
AutoConfig,
AutoModelForMaskedLM,
AutoTokenizer,
)
from transformers import Trainer as HfTrainer
def preprocess_function(raw_example, **kwargs):
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
example = InputExample(**raw_example)
example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
padding="max_length", truncation=True)
return model_inputs
def compute_metrics(eval_preds, dataset_name, eval_metric):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in eval_metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.mask_token
def get_remove_columns(dataset_features):
# from IPython import embed; embed(header="get_remove_columns")
dataset_features.remove("label")
return dataset_features
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
from openpromptu.prompts import ManualVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
# from IPython import embed; embed()
return template, verbalizer, tokenizer_wrapper
class DataCollator(HfDataCollatorMixin):
def __init__(self, *args, **kwargs):
self.return_tensors='pt'
def torch_call(self, features):
return torch_default_data_collator(features=features)
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForMaskedLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model.resize_token_embeddings(len(tokenizer))
return config, tokenizer, model
class Trainer(HfTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.verbalizer=verbalizer
self.eval_task=eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
labels = inputs.pop('labels')
outputs = model(**inputs)
logits = outputs.get("logits")
input_ids = inputs['input_ids']
verbalizer = self.verbalizer.cuda()
logits_at_mask = logits[torch.where(input_ids == verbalizer.tokenizer.mask_token_id)]
label_logits = verbalizer.process_logits(logits_at_mask)
loss_fct = torch.nn.CrossEntropyLoss()
loss = loss_fct(label_logits, labels)
outputs.logits = label_logits
return (loss, outputs) if return_outputs else loss
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in self.eval_task.metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result

View File

@ -0,0 +1,169 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
import numpy as np
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoTokenizer,
)
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
import copy
from torch.nn import CrossEntropyLoss
def preprocess_function(raw_example, **kwargs):
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
example = InputExample(**raw_example)
# example = verbalizer.wrap_one_example(example)
example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
padding="max_length", truncation=True)
return model_inputs
def compute_metrics(eval_preds, dataset_name, eval_metric):
pass
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.pad_token
def get_remove_columns(dataset_features):
# dataset_features.remove("label")
return dataset_features
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
from openpromptu.prompts import GenerationVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = None, label_words=None)
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return template, verbalizer, tokenizer_wrapper
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForCausalLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
return config, tokenizer, model
class Trainer(HfSeq2SeqTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.eval_task = eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
labels=copy.deepcopy(inputs['input_ids'])
# labels[labels==self.tokenizer.pad_token_id]=-100
outputs = model(**inputs)
logits = outputs.logits
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.long().view(-1))
return (loss, outputs) if return_outputs else loss
def prediction_step(
self,
model, #nn.Module,
inputs, #Dict[str, Union[torch.Tensor, Any]],
prediction_loss_only, #: bool,
ignore_keys, #: Optional[List[str]] = None,
): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
"""
Perform an evaluation step on :obj:`model` using :obj:`inputs`.
Subclass and override to inject custom behavior.
Args:
model (:obj:`nn.Module`):
The model to evaluate.
inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
The inputs and targets of the model.
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
argument :obj:`labels`. Check your model's documentation for all accepted arguments.
prediction_loss_only (:obj:`bool`):
Whether or not to return the loss only.
Return:
Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
labels (each being optional).
"""
if not self.args.predict_with_generate or prediction_loss_only:
return super().prediction_step(
model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
)
inputs = self._prepare_inputs(inputs)
with torch.no_grad():
labels=copy.deepcopy(inputs['input_ids'])
# labels[labels==self.tokenizer.pad_token_id]=-100
outputs = model(**inputs)
logits = outputs.logits
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous().long()
loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1)).detach().cpu()
loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss)
if prediction_loss_only:
return (loss, None, None)
else:
# non pad label
shift_labels = shift_labels.view(-1).detach().cpu()
nonpad_idx = shift_labels!=self.tokenizer.pad_token_id
shift_labels = shift_labels[nonpad_idx]
# the probability at the corresponding position
shift_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu()
target_position = torch.nn.functional.one_hot(shift_labels,shift_logits.shape[-1]).bool().to(shift_labels.device)
shift_logits = shift_logits.softmax(dim=-1)[target_position]
return (loss, shift_logits, shift_labels)
def _compute_metrics(self, eval_preds):
preds, labels = eval_preds
result = {}
for metric in self.eval_task.metric:
result.update(metric(preds, labels,ignore_index=self.tokenizer.pad_token_id))
average_metric = sum(result.values())/len(result)
result.update({"average_metrics":average_metric})
return result

View File

@ -0,0 +1,181 @@
from openpromptu.data_utils import InputExample
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
from transformers import (
AutoConfig,
BlenderbotForConditionalGeneration,
AutoTokenizer,
)
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
import torch
def mask_token_func(tokenizer, ith_mask=0):
return ""
def get_remove_columns(dataset_features):
return dataset_features
def preprocess_function(raw_example, **kwargs):
# max_target_length += 1
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
split = kwargs['split']
example = InputExample(**raw_example)
example = verbalizer.wrap_one_example(example)
example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
padding="max_length", truncation=True)
with tokenizer.as_target_tokenizer():
label = tokenizer(other['tgt_text']).input_ids
model_inputs["labels"] = label
# from IPython import embed; embed()
return model_inputs
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = BlenderbotForConditionalGeneration.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# from IPython import embed; embed()
return config, tokenizer, model
def get_prompts(task, tokenizer, data_args, template_id="blenderbot", verbalizer_id="blenderbot"):
from openpromptu.prompts import GenerationVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return template, verbalizer, tokenizer_wrapper
class Trainer(HfSeq2SeqTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.eval_task = eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
# from IPython import embed; embed()
outputs = model(**inputs)
if return_outputs:
return (outputs.loss, outputs)
else:
return outputs.loss
def prediction_step(
self,
model, #nn.Module,
inputs, #Dict[str, Union[torch.Tensor, Any]],
prediction_loss_only, #: bool,
ignore_keys, #: Optional[List[str]] = None,
): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
"""
Perform an evaluation step on :obj:`model` using :obj:`inputs`.
Subclass and override to inject custom behavior.
Args:
model (:obj:`nn.Module`):
The model to evaluate.
inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
The inputs and targets of the model.
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
argument :obj:`labels`. Check your model's documentation for all accepted arguments.
prediction_loss_only (:obj:`bool`):
Whether or not to return the loss only.
Return:
Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
labels (each being optional).
"""
if not self.args.predict_with_generate or prediction_loss_only:
return super().prediction_step(
model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
)
has_labels = "labels" in inputs
inputs = self._prepare_inputs(inputs)
gen_kwargs = {
"max_length": 10, # self._max_length if s is not None else self.model.config.max_length,
"num_beams": 1, #self._num_beams if self._num_beams is not None else self.model.config.num_beams,
"min_length": 1 # for blenderbot, generally we set it to be a large number. But in classification, we set it to 1
}
generated_tokens = self.model.generate(
inputs["input_ids"],
attention_mask=inputs["attention_mask"],
**gen_kwargs,
)
# in case the batch is shorter than max length, the output should be padded
if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
with torch.no_grad():
outputs = model(**inputs)
if has_labels:
if self.label_smoother is not None:
loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
else:
loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
else:
loss = None
if self.args.prediction_loss_only:
return (loss, None, None)
labels = inputs["labels"]
if labels.shape[-1] < gen_kwargs["max_length"]:
labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
# from IPython import embed; embed(header="In seqseqtrainer")
return (loss, generated_tokens, labels)
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds
decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
# post_processor = .get(data_args.dataset_name[0], tokenizer,
# data_args.ignore_pad_token_for_loss)
# decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info)
result = {}
for metric in self.eval_task.metric:
result.update(metric(decoded_preds, decoded_labels))
average_metric = sum(result.values())/len(result)
result.update({"average_metrics":average_metric})
return result
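
The helpers above are meant to be wired together by the example run script. The following is a minimal sketch of that wiring, not part of the diff: `model_args`, `data_args`, `training_args`, `task`, and `raw_dataset` are hypothetical stand-ins for the argument dataclasses and dataset the script supplies, so it illustrates the call pattern rather than a runnable excerpt of the repo.

# Hypothetical wiring; every *_args object is an assumed stand-in.
config, tokenizer, model = get_backbone(model_args)
template, verbalizer, tokenizer_wrapper = get_prompts(task, tokenizer, data_args)

def preprocess(raw_example):
    # Bind the keyword context that preprocess_function expects.
    return preprocess_function(
        raw_example, tokenizer=tokenizer, data_args=data_args,
        template=template, verbalizer=verbalizer,
        tokenizer_wrapper=tokenizer_wrapper, split="train")

train_dataset = raw_dataset.map(
    preprocess, remove_columns=get_remove_columns(list(raw_dataset.features)))
trainer = Trainer(model=model, args=training_args, tokenizer=tokenizer,
                  train_dataset=train_dataset,
                  data_collator=DataCollator(tokenizer, model=model),
                  eval_task=task)
trainer.train()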

View File

@ -0,0 +1,172 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
import numpy as np
from transformers import (
CLIPConfig,
CLIPProcessor,
CLIPModel,
)
from transformers import ViTFeatureExtractor
from PIL import Image
from transformers import Trainer as HfTrainer
import torch.nn as nn
def get_prompts(task, tokenizer, data_args, template_id="clip", verbalizer_id="clip"):
from openpromptu.prompts import GenerationVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer.tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return template, verbalizer, tokenizer_wrapper
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.mask_token
def preprocess_function(raw_example, **kwargs):
# from IPython import embed; embed(header="Therefa")
tokenizer = kwargs['tokenizer']
# ["a photo of {}" for i in range()]
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
example = InputExample(**raw_example)
texts = []
for candidate_label in range(verbalizer.num_classes):
tgt_text = verbalizer.wrap_one_example(label=candidate_label)
wrapped_example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(wrapped_example, tgt_texts=[tgt_text])
texts.append(input_sentence)
# from IPython import embed; embed()
image = Image.open(raw_example['image_file_path'])
model_inputs = tokenizer(images=image, text=texts, max_length=16, padding="max_length", truncation=True, return_tensors='pt')
# from IPython import embed; embed()
model_inputs["pixel_values"] = model_inputs["pixel_values"].squeeze()
model_inputs["label"] = example.label
return model_inputs
def compute_metrics(eval_preds, dataset_name, eval_metric):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in eval_metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result
def get_remove_columns(dataset_features):
# from IPython import embed; embed(header="in remoev")
dataset_features.remove("labels")
print("remove_columns: {}".format(dataset_features))
return dataset_features
class DataCollator(HfDataCollatorMixin):
def __init__(self, *args, **kwargs):
self.return_tensors='pt'
def torch_call(self, features):
# from IPython import embed; embed(header="in data collator")
a = torch_default_data_collator(features=features)
# from IPython import embed; embed(header="in data collator")
a["input_ids"] = a["input_ids"][0]
a["attention_mask"] = a["attention_mask"][0]
return a
def get_backbone(model_args, **kwargs):
config = CLIPConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# config.dropout_rate = 0.0
tokenizer = CLIPProcessor.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = CLIPModel.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# config.num_labels = model_args.num_classes
# old_classifier = model.classifier
# model.classifier = nn.Linear(old_classifier.in_features, config.num_labels)
return config, tokenizer, model
class Trainer(HfTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.verbalizer=verbalizer
self.eval_task=eval_task
self.compute_metrics = self._compute_metrics
self.loss_fn = nn.CrossEntropyLoss()
def compute_loss(self, model, inputs, return_outputs=False):
# from IPython import embed; embed()
labels = inputs.pop('labels')
outputs = model(**inputs)
# logits = outputs.get("logits")
logits_per_image = outputs.logits_per_image
loss = self.loss_fn(logits_per_image, labels)
return (loss, outputs) if return_outputs else loss
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in self.eval_task.metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
# from IPython import embed; embed(header="In compute metrics")
return result
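
The collator and loss above implement classification by image-text matching: each image is scored against one templated sentence per candidate label (which is why the collator keeps only the first copy of the repeated text batch), so `logits_per_image` has shape `[batch, num_classes]` and a plain cross-entropy over label indices applies. A self-contained sketch with made-up shapes:

import torch

# Hypothetical shapes: a batch of 8 images scored against 3 candidate
# label sentences (e.g. the three classes of the beans dataset).
logits_per_image = torch.randn(8, 3)
labels = torch.randint(0, 3, (8,))
loss = torch.nn.CrossEntropyLoss()(logits_per_image, labels)
preds = logits_per_image.argmax(dim=-1)   # predicted class per image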

View File

@ -0,0 +1,171 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
import numpy as np
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoTokenizer,
)
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
import copy
from torch.nn import CrossEntropyLoss
def preprocess_function(raw_example, **kwargs):
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
example = InputExample(**raw_example)
# example = verbalizer.wrap_one_example(example)
example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
padding="max_length", truncation=True)
return model_inputs
def compute_metrics(eval_preds, dataset_name, eval_metric):
pass
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.pad_token
def get_remove_columns(dataset_features):
# dataset_features.remove("label")
return dataset_features
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
from openpromptu.prompts import GenerationVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = None, label_words=None)
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="tail", mask_token_func=mask_token_func)
return template, verbalizer, tokenizer_wrapper
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
if getattr(tokenizer, "pad_token", None) is None:
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
return config, tokenizer, model
class Trainer(HfSeq2SeqTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.eval_task = eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
labels=copy.deepcopy(inputs['input_ids'])
# labels[labels==self.tokenizer.pad_token_id]=-100
outputs = model(**inputs)
logits = outputs.logits
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.long().view(-1))
return (loss, outputs) if return_outputs else loss
def prediction_step(
self,
model, #nn.Module,
inputs, #Dict[str, Union[torch.Tensor, Any]],
prediction_loss_only, #: bool,
ignore_keys, #: Optional[List[str]] = None,
): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
"""
Perform an evaluation step on :obj:`model` using :obj:`inputs`.
Subclass and override to inject custom behavior.
Args:
model (:obj:`nn.Module`):
The model to evaluate.
inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
The inputs and targets of the model.
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
argument :obj:`labels`. Check your model's documentation for all accepted arguments.
prediction_loss_only (:obj:`bool`):
Whether or not to return the loss only.
Return:
Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
labels (each being optional).
"""
if not self.args.predict_with_generate or prediction_loss_only:
return super().prediction_step(
model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
)
inputs = self._prepare_inputs(inputs)
with torch.no_grad():
labels=copy.deepcopy(inputs['input_ids'])
# labels[labels==self.tokenizer.pad_token_id]=-100
outputs = model(**inputs)
logits = outputs.logits
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous().long()
loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1)).detach().cpu()
loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss)
if prediction_loss_only:
return (loss, None, None)
else:
# non pad label
shift_labels = shift_labels.view(-1).detach().cpu()
nonpad_idx = shift_labels!=self.tokenizer.pad_token_id
shift_labels = shift_labels[nonpad_idx]
# the probability at the corresponding position
shift_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu()
target_position = torch.nn.functional.one_hot(shift_labels,shift_logits.shape[-1]).bool().to(shift_labels.device)
shift_logits = shift_logits.softmax(dim=-1)[target_position]
return (loss, shift_logits, shift_labels)
def _compute_metrics(self, eval_preds):
preds, labels = eval_preds
result = {}
for metric in self.eval_task.metric:
result.update(metric(preds, labels,ignore_index=self.tokenizer.pad_token_id))
average_metric = sum(result.values())/len(result)
result.update({"average_metrics":average_metric})
return result
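
Both compute_loss and prediction_step above rely on the standard causal-LM shift: the logit at position t is trained to predict the token at position t+1, so logits drop their last position and labels drop their first. A self-contained illustration with toy tensors:

import torch
from torch.nn import CrossEntropyLoss

logits = torch.randn(2, 5, 100)              # [batch, seq_len, vocab]
input_ids = torch.randint(0, 100, (2, 5))    # labels are the inputs themselves
shift_logits = logits[..., :-1, :].contiguous()   # positions 0..3
shift_labels = input_ids[..., 1:].contiguous()    # tokens    1..4
loss = CrossEntropyLoss()(shift_logits.view(-1, 100), shift_labels.view(-1))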

View File

@ -0,0 +1,177 @@
from openpromptu.data_utils import InputExample
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
from transformers import (
AutoConfig,
AutoModelForSeq2SeqLM,
AutoTokenizer,
)
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
import torch
def mask_token_func(tokenizer, ith_mask):
return tokenizer.additional_special_tokens[ith_mask]
def get_remove_columns(dataset_features):
return dataset_features
def preprocess_function(raw_example, **kwargs):
# max_target_length += 1
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
split = kwargs['split']
example = InputExample(**raw_example)
example = verbalizer.wrap_one_example(example)
example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
model_inputs = tokenizer(input_sentence, max_length=256,
padding="max_length", truncation=True)
with tokenizer.as_target_tokenizer():
label = tokenizer(other['tgt_text']).input_ids
model_inputs["labels"] = label
return model_inputs
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
return config, tokenizer, model
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
from openpromptu.prompts import GenerationVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return template, verbalizer, tokenizer_wrapper
class Trainer(HfSeq2SeqTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.eval_task = eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
outputs = model(**inputs)
if return_outputs:
return (outputs.loss, outputs)
else:
return outputs.loss
def prediction_step(
self,
model, #nn.Module,
inputs, #Dict[str, Union[torch.Tensor, Any]],
prediction_loss_only, #: bool,
ignore_keys, #: Optional[List[str]] = None,
): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
"""
Perform an evaluation step on :obj:`model` using :obj:`inputs`.
Subclass and override to inject custom behavior.
Args:
model (:obj:`nn.Module`):
The model to evaluate.
inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
The inputs and targets of the model.
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
argument :obj:`labels`. Check your model's documentation for all accepted arguments.
prediction_loss_only (:obj:`bool`):
Whether or not to return the loss only.
Return:
Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
labels (each being optional).
"""
if not self.args.predict_with_generate or prediction_loss_only:
return super().prediction_step(
model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
)
has_labels = "labels" in inputs
inputs = self._prepare_inputs(inputs)
gen_kwargs = {
"max_length": 10, # self._max_length if s is not None else self.model.config.max_length,
"num_beams": 1 #self._num_beams if self._num_beams is not None else self.model.config.num_beams,
}
generated_tokens = self.model.generate(
inputs["input_ids"],
attention_mask=inputs["attention_mask"],
**gen_kwargs,
)
# in case the batch is shorter than max length, the output should be padded
if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
with torch.no_grad():
outputs = model(**inputs)
if has_labels:
if self.label_smoother is not None:
loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
else:
loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
else:
loss = None
if self.args.prediction_loss_only:
return (loss, None, None)
labels = inputs["labels"]
if labels.shape[-1] < gen_kwargs["max_length"]:
labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
# from IPython import embed; embed(header="In seqseqtrainer")
return (loss, generated_tokens, labels)
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds
decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
# post_processor = .get(data_args.dataset_name[0], tokenizer,
# data_args.ignore_pad_token_for_loss)
# decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info)
result = {}
for metric in self.eval_task.metric:
result.update(metric(decoded_preds, decoded_labels))
average_metric = sum(result.values())/len(result)
result.update({"average_metrics":average_metric})
return result
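
Unlike the Blenderbot helper, `mask_token_func` here returns a sentinel from `additional_special_tokens`, which for T5-style tokenizers are the `<extra_id_*>` placeholders. A quick check, assuming a T5 checkpoint such as `t5-small` is available locally or from the hub:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("t5-small")
print(mask_token_func(tok, 0))   # e.g. "<extra_id_0>"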

View File

@ -0,0 +1,48 @@
{
"bottleneck_dim": 24,
"dataset_config_name": [
"en"
],
"delta_type": "adapter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "beans",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32",
"num_classes": 3,
"num_train_epochs": 20,
"output_dir": "outputs/adapter/clip-vit-base-patch32/beans",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_delta_center": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "beans",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "beans",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0
}
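
Configs like the one above are plain JSON consumed by the example run scripts; a minimal sketch of inspecting one (the path is illustrative):

import json

with open("configs/adapter/clip-vit-base-patch32/beans.json") as fin:
    cfg = json.load(fin)
print(cfg["delta_type"], cfg["learning_rate"], cfg["unfrozen_modules"])
# adapter 0.0003 ['deltas', 'layer_norm', 'final_layer_norm']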

View File

@ -0,0 +1,53 @@
{
"backbone_model": "opt",
"bottleneck_dim": 24,
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "adapter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "wikitext",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps":2,
"greater_is_better": false,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 900,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m",
"model_path_public": "opt-350m",
"num_train_epochs": 3,
"output_dir": "outputs/adapter/opt-350m/wikitext",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 6,
"per_device_train_batch_size": 6,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "wikitext",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "wikitext",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["self_attn"]
}

View File

@ -0,0 +1,53 @@
{
"backbone_model": "vit",
"bottleneck_dim": 24,
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": false,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "adapter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "beans",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k",
"model_path_public": "vit-large-patch16-224-in21k",
"num_classes": 3,
"num_train_epochs": 20,
"output_dir": "outputs/adapter/vit-large-patch16-224-in21k/beans",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": false,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "beans",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "beans",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["output"]
}

View File

@ -0,0 +1,51 @@
{
"backbone_model": "t5-large",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "rte",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/t5-large",
"model_path_public": "t5-large",
"num_train_epochs": 20,
"output_dir": "outputs/bitfit/t5-large/rte",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "rte",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "rte",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/t5-large",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["attn", "ff", "layer_norm"]
}

View File

@ -0,0 +1,66 @@
{
"backbone_model": "blenderbot",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "compacter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "sst2",
"eval_steps": 200,
"evaluation_strategy": "steps",
"factorized_phm": true,
"factorized_phm_rule": false,
"gradient_clip": false,
"greater_is_better": true,
"hypercomplex_adapters": true,
"hypercomplex_division": 4,
"hypercomplex_nonlinearity": "glorot-uniform",
"learn_phm": true,
"learning_rate": 0.003,
"load_best_model_at_end": true,
"max_source_length": 128,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b",
"model_path_public": "blenderbot-3b",
"non_linearity": "gelu_new",
"normalize_phm_weight": false,
"num_train_epochs": 3,
"output_dir": "outputs/compacter/blenderbot-3b/sst2",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"phm_c_init": "normal",
"phm_clamp": false,
"phm_init_range": 0.0001,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"shared_phm_rule": false,
"split_validation_test": true,
"task_name": "sst2",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "sst2",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"use_bias_down_sampler": true,
"use_bias_up_sampler": true,
"warmup_steps": 0,
"modified_modules":["fc2"]
}

View File

@ -0,0 +1,51 @@
{
"backbone_model": "deberta-v2-xlarge",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "compacter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mnli",
"eval_steps": 500,
"evaluation_strategy": "steps",
"greater_is_better": true,
"is_seq2seq": false,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge",
"num_train_epochs": 3,
"output_dir": "outputs/compacter/deberta-v2-xlarge/mnli",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": false,
"push_to_dc": true,
"push_to_hub": false,
"save_steps": 500,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "mnli",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mnli",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["attention"]
}

View File

@ -0,0 +1,51 @@
{
"backbone_model": "long-t5",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "compacter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "rte",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large",
"model_path_public": "long-t5-tglobal-large",
"num_train_epochs": 20,
"output_dir": "outputs/compacter/long-t5-tglobal-large/rte",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "rte",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "rte",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["attn", "ff", "layer_norm"]
}

View File

@ -0,0 +1,51 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
parser.add_argument("--")
args = parser.parse_args()
if __name__ == "__main__":
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"configs/{args.job}/"):
os.mkdir(f"configs/{args.job}/")
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)

View File

@ -0,0 +1,116 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
#### ROBERTA######
BaseConfigs['albert-xlarge-v2'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}albert-xlarge-v2",
"tokenizer_name": f"{PATHBASE}albert-xlarge-v2",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['prefix_albert-xlarge-v2'] = copy.deepcopy(BaseConfigs['albert-xlarge-v2'])
AllConfigs['prefix_albert-xlarge-v2'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/albert-xlarge-v2/",
})
AllConfigs['soft_prompt_albert-xlarge-v2'] = copy.deepcopy(BaseConfigs['albert-xlarge-v2'])
AllConfigs['soft_prompt_albert-xlarge-v2'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/albert-xlarge-v2/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"configs/{args.job}/"):
os.mkdir(f"configs/{args.job}/")
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
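
The tuple-key convention drives the whole generator: a tuple key zips per-job values together, scalar keys are shared across all jobs, and `job_name` names the output file and is appended to `output_dir`. A tiny worked example with a hypothetical `demo` job:

config = {
    ("job_name", "task_name", "eval_steps"): zip(["rte", "mrpc"],
                                                 ["rte", "mrpc"],
                                                 [100, 200]),
    "seed": 42,
    "output_dir": "outputs/demo/",
}
# Fed through the loop above, this writes:
#   configs/demo/rte.json  -> {"task_name": "rte",  "eval_steps": 100,
#                              "seed": 42, "output_dir": "outputs/demo/rte"}
#   configs/demo/mrpc.json -> {"task_name": "mrpc", "eval_steps": 200,
#                              "seed": 42, "output_dir": "outputs/demo/mrpc"}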

View File

@ -0,0 +1,261 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
# PATHBASE=""
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['bart-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}bart-base",
"tokenizer_name": f"{PATHBASE}bart-base",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['bitfit_bart-base'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/bart-base/",
})
AllConfigs['adapter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['adapter_bart-base'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/bart-base/",
})
AllConfigs['parallel_adapter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['parallel_adapter_bart-base'].update({
"delta_type": "parallel_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/parallel_adapter/bart-base/",
})
AllConfigs['lora_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['lora_bart-base'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"modified_modules": [
"q_proj",
"v_proj",
],
"lora_r": 8,
"output_dir": "outputs/lora/bart-base/",
})
AllConfigs['compacter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['compacter_bart-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/bart-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['compacter++_bart-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/bart-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['low_rank_adapter_bart-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/bart-base/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['prefix_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['prefix_bart-base'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/bart-base/",
})
AllConfigs['soft_prompt_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['soft_prompt_bart-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/bart-base/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"configs/{args.job}/"):
os.mkdir(f"configs/{args.job}/")
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)

View File

@ -0,0 +1,250 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['beit-base-patch16-224'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "num_classes"): zip(
["beans"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20],
[256],
[ 32],
[ 32],#, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0], # *7 +[0] *8,
[200],# 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200],#, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[ 3],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}beit-base-patch16-224",
"tokenizer_name": f"{PATHBASE}beit-base-patch16-224",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps",
"datasets_load_from_disk":False,
}
AllConfigs['bitfit_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['bitfit_beit-base-patch16-224'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/beit-base-patch16-224/",
})
AllConfigs['adapter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['adapter_beit-base-patch16-224'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/beit-base-patch16-224/",
})
AllConfigs['lora_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['lora_beit-base-patch16-224'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layernorm_after",
"classifier"
],
"modified_modules":[
"query",
"value",
],
"lora_r": 8,
"output_dir": "outputs/lora/beit-base-patch16-224/",
})
AllConfigs['compacter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['compacter_beit-base-patch16-224'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/beit-base-patch16-224/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['compacter++_beit-base-patch16-224'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/beit-base-patch16-224/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['low_rank_adapter_beit-base-patch16-224'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/beit-base-patch16-224/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['prefix_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['prefix_beit-base-patch16-224'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/beit-base-patch16-224/",
})
AllConfigs['soft_prompt_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['soft_prompt_beit-base-patch16-224'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/beit-base-patch16-224/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"configs/{args.job}/"):
os.mkdir(f"configs/{args.job}/")
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)

View File

@ -0,0 +1,125 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
# PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
#### BERT ######
BaseConfigs['bert-base-cased'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}bert-base-cased",
"tokenizer_name": f"{PATHBASE}bert-base-cased",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps",
"datasets_load_from_disk": True,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/"
}
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['prefix_bert-base-cased'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/bert-base-cased/",
})
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['soft_prompt_bert-base-cased'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/bert-base-cased/",
})
AllConfigs['prefix_bert-large-cased'] = copy.deepcopy(AllConfigs['prefix_bert-base-cased'])
AllConfigs['prefix_bert-large-cased'].update({
"output_dir": "outputs/prefix/bert-large-cased/",
"model_name_or_path": f"{PATHBASE}bert-large-cased",
"tokenizer_name": f"{PATHBASE}bert-large-cased",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"configs/{args.job}/"):
os.mkdir(f"configs/{args.job}/")
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)

View File

@ -0,0 +1,147 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
#### ROBERTA ######
BaseConfigs['bigbird-roberta-large'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}bigbird-roberta-large",
"tokenizer_name": f"{PATHBASE}bigbird-roberta-large",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['bitfit_bigbird-roberta-large'].update({
"delta_type": "bitfit",
"learning_rate": 1e-3,
"output_dir": "outputs/bitfit/bigbird-roberta-large/",
})
AllConfigs['none_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['none_bigbird-roberta-large'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/bigbird-roberta-large/",
})
AllConfigs['lora_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['lora_bigbird-roberta-large'].update({
"delta_type": "lora",
"learning_rate": 1e-3,
"modified_modules": [
"query",
"key",
],
"output_dir": "outputs/lora/bigbird-roberta-large/",
})
AllConfigs['adapter_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['adapter_bigbird-roberta-large'].update({
"delta_type": "adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/adapter/bigbird-roberta-large/",
})
AllConfigs['low_rank_adapter_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['low_rank_adapter_bigbird-roberta-large'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/low_rank_adapter/bigbird-roberta-large/",
})
AllConfigs['soft_prompt_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['soft_prompt_bigbird-roberta-large'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/bigbird-roberta-large/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser(description="Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)  # also creates configs/ if missing
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
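
In the training scripts these fields map onto OpenDelta calls: "delta_type" picks the delta class, "modified_modules" selects which submodules get wrapped (matched against module-name suffixes), and "unfrozen_modules" feeds the freeze step. A sketch against the OpenDelta 0.3.2 API as documented in its README (argument names hedged, not verified against this exact tree):

from transformers import AutoModelForSequenceClassification
from opendelta import LoraModel

# Backbone named in the config; a public checkpoint stands in for PATHBASE here.
model = AutoModelForSequenceClassification.from_pretrained("roberta-base")

# "delta_type": "lora", "modified_modules": ["query", "key"], "lora_r" left at 8
delta = LoraModel(backbone_model=model,
                  modified_modules=["query", "key"],
                  lora_r=8)

# Freeze everything except the new delta parameters ("unfrozen_modules").
delta.freeze_module(exclude=["deltas"], set_state_dict=True)
delta.log()   # prints the modified structure and the trainable-parameter ratio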

View File

@ -0,0 +1,254 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['blenderbot-400M-distill'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}blenderbot-400M-distill",
"tokenizer_name": f"{PATHBASE}blenderbot-400M-distill",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['bitfit_blenderbot-400M-distill'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/blenderbot-400M-distill/",
})
AllConfigs['adapter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['adapter_blenderbot-400M-distill'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/blenderbot-400M-distill/",
})
AllConfigs['lora_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['lora_blenderbot-400M-distill'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"modified_modules":[
"q_proj",
"v_proj",
],
"lora_r": 8,
"output_dir": "outputs/lora/blenderbot-400M-distill/",
})
AllConfigs['compacter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['compacter_blenderbot-400M-distill'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/blenderbot-400M-distill/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['compacter++_blenderbot-400M-distill'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/blenderbot-400M-distill/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['low_rank_adapter_blenderbot-400M-distill'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/blenderbot-400M-distill/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['none_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['none_blenderbot-400M-distill'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/blenderbot-400M-distill/",
})
AllConfigs['soft_prompt_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['soft_prompt_blenderbot-400M-distill'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-2,
"soft_token_num":100,
"token_init": False,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/blenderbot-400M-distill/",
})
AllConfigs['prefix_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['prefix_blenderbot-400M-distill'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/blenderbot-400M-distill/",
})
# NOTE: redefines the soft_prompt entry above; this later block
# (learning_rate 3e-4, no soft_token_num) silently wins.
AllConfigs['soft_prompt_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['soft_prompt_blenderbot-400M-distill'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/blenderbot-400M-distill/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser(description="Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)  # also creates configs/ if missing
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
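
The compacter blocks above expose the PHM (parameterized hypercomplex multiplication) knobs: with "hypercomplex_division" n, every adapter weight is a sum of n Kronecker products, and "factorized_phm" additionally makes each right-hand factor low-rank. A rough numerical sketch of that parameterization (illustrative only, not OpenDelta's exact code):

import torch

n, d_in, d_out = 4, 768, 48    # "hypercomplex_division": 4; toy adapter dims

A = torch.randn(n, n, n) * 1e-4            # "rule" factors, cf. "phm_init_range"
B = torch.randn(n, d_in // n, d_out // n)  # per-division weight factors

# W = sum_i kron(A_i, B_i) -> (d_in, d_out), with far fewer free parameters
# than a dense (d_in x d_out) matrix.
W = sum(torch.kron(A[i], B[i]) for i in range(n))
assert W.shape == (d_in, d_out)

# "factorized_phm": each B_i is itself a rank-1 product, saving more parameters.
rank = 1
B_left  = torch.randn(n, d_in // n, rank)
B_right = torch.randn(n, rank, d_out // n)
B_factored = B_left @ B_right              # batched matmul, same shape as B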

View File

@ -0,0 +1,303 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
# PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['clip-vit-base-patch32'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "num_classes"): zip(
["beans"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20],
[256],
[ 32],
[ 32],#, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0], # *7 +[0] *8,
[200],# 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200],#, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[ 3],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}clip-vit-base-patch32",
"tokenizer_name": f"{PATHBASE}clip-vit-base-patch32",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['bitfit_clip-vit-base-patch32'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/clip-vit-base-patch32/",
})
AllConfigs['none_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['none_clip-vit-base-patch32'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/clip-vit-base-patch32/",
})
AllConfigs['adapter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['adapter_clip-vit-base-patch32'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/clip-vit-base-patch32/",
})
AllConfigs['lora_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['lora_clip-vit-base-patch32'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/clip-vit-base-patch32/",
})
AllConfigs['compacter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['compacter_clip-vit-base-patch32'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/clip-vit-base-patch32/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['compacter++_clip-vit-base-patch32'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/clip-vit-base-patch32/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['low_rank_adapter_clip-vit-base-patch32'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/clip-vit-base-patch32/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['soft_prompt_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['soft_prompt_clip-vit-base-patch32'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-2,
"soft_token_num":100,
"token_init": False,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/clip-vit-base-patch32/",
})
AllConfigs['prefix_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['prefix_clip-vit-base-patch32'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/clip-vit-base-patch32/",
})
# NOTE: redefines the soft_prompt entry above; this later block
# (learning_rate 3e-4, no soft_token_num) silently wins.
AllConfigs['soft_prompt_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['soft_prompt_clip-vit-base-patch32'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/clip-vit-base-patch32/",
})
#### clip-vit-base-patch32
BaseConfigs['t5-small'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}t5-small",
"tokenizer_name": f"{PATHBASE}t5-small",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
AllConfigs['prefix_t5-small'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-small/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser(description="Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)  # also creates configs/ if missing
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
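
Downstream, the emitted JSON is read back by the run script through HfArgumentParser, which fills dataclass fields from the file. A sketch of that consumption side (the dataclass here is a placeholder for the repo's real argument classes, which must cover every key in the JSON):

import sys
from dataclasses import dataclass, field
from transformers import HfArgumentParser, TrainingArguments

@dataclass
class DeltaArguments:                      # placeholder; the repo defines its own
    delta_type: str = "none"
    unfrozen_modules: list = field(default_factory=lambda: ["deltas"])

parser = HfArgumentParser((DeltaArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
    delta_args, training_args = parser.parse_json_file(json_file=sys.argv[1])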

View File

@ -0,0 +1,433 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['t5-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}t5-base",
"tokenizer_name": f"{PATHBASE}t5-base",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-base/",
})
AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['adapter_t5-base'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/t5-base/",
})
AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['lora_t5-base'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-base/",
})
AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter++_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['low_rank_adapter_t5-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/t5-base/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-2,
"soft_token_num":100,
"token_init": False,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/t5-base/",
})
AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['prefix_t5-base'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-base/",
})
#### T5-base
BaseConfigs['t5-small'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}t5-small",
"tokenizer_name": f"{PATHBASE}t5-small",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
AllConfigs['prefix_t5-small'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-small/",
})
#### ROBERTA ######
BaseConfigs['roberta-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}roberta-base",
"tokenizer_name": f"{PATHBASE}roberta-base",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['bitfit_roberta-base'].update({
"delta_type": "bitfit",
"learning_rate": 1e-3,
"output_dir": "outputs/bitfit/roberta-base/",
})
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['none_roberta-base'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/roberta-base/",
})
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['lora_roberta-base'].update({
"delta_type": "lora",
"learning_rate": 1e-3,
"output_dir": "outputs/lora/roberta-base/",
})
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['adapter_roberta-base'].update({
"delta_type": "adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/adapter/roberta-base/",
})
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['low_rank_adapter_roberta-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/low_rank_adapter/roberta-base/",
})
#### ROBERTA ######
BaseConfigs['bert-base-cased'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}bert-base-cased",
"tokenizer_name": f"{PATHBASE}bert-base-cased",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['prefix_bert-base-cased'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/bert-base-cased/",
})
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['soft_prompt_bert-base-cased'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/bert-base-cased/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser(description="Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)  # also creates configs/ if missing
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
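
BitFit ("delta_type": "bitfit") trains only the bias vectors, which is why it pairs with a much larger learning rate (1e-3 here vs 1e-5 for full fine-tuning, "none"). The trainable fraction is easy to inspect (a sketch; any cached checkpoint works):

from transformers import AutoModel

model = AutoModel.from_pretrained("bert-base-cased")
total  = sum(p.numel() for p in model.parameters())
biases = sum(p.numel() for n, p in model.named_parameters() if n.endswith(".bias"))
print(f"bias params: {biases:,} / {total:,} = {biases / total:.4%}")
# On BERT-base this comes out on the order of 0.1% of the backbone.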

View File

@ -0,0 +1,163 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
# PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
#### ROBERTA ######
BaseConfigs['roberta-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}roberta-base",
"tokenizer_name": f"{PATHBASE}roberta-base",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps",
"datasets_load_from_disk": True,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/"
}
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['bitfit_roberta-base'].update({
"delta_type": "bitfit",
"learning_rate": 1e-3,
"output_dir": "outputs/bitfit/roberta-base/",
})
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['none_roberta-base'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/roberta-base/",
})
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['lora_roberta-base'].update({
"delta_type": "lora",
"learning_rate": 1e-3,
"output_dir": "outputs/lora/roberta-base/",
})
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['adapter_roberta-base'].update({
"delta_type": "adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/adapter/roberta-base/",
})
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['low_rank_adapter_roberta-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/low_rank_adapter/roberta-base/",
})
AllConfigs['soft_prompt_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['soft_prompt_roberta-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/roberta-base/",
})
AllConfigs['prefix_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['prefix_roberta-base'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/roberta-base/",
})
AllConfigs['prefix_roberta-large'] = copy.deepcopy(AllConfigs['prefix_roberta-base'])
AllConfigs['prefix_roberta-large'].update({
"output_dir": "outputs/prefix/prefix_roberta-large",
"model_name_or_path": f"{PATHBASE}prefix_roberta-large",
"tokenizer_name": f"{PATHBASE}prefix_roberta-large",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser(description="Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)  # also creates configs/ if missing
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
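
The two new keys in this file, "datasets_load_from_disk" and "datasets_saved_path", switch the data pipeline from Hub downloads to a pre-serialized local copy, which is useful on clusters without outbound network. The corresponding datasets calls look roughly like this (a sketch; the per-task subdirectory name is an assumption):

from datasets import load_dataset, load_from_disk

# One-time export on a machine with network access:
load_dataset("super_glue", "boolq").save_to_disk(
    "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/super_glue.boolq")

# Offline training then reads the serialized DatasetDict back:
ds = load_from_disk(
    "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/super_glue.boolq")
print(ds["train"][0])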

View File

@ -0,0 +1,300 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
# PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['t5-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}t5-base",
"tokenizer_name": f"{PATHBASE}t5-base",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hf": False,
"push_to_dc": True,
"save_strategy": "steps",
"datasets_load_from_disk": True,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"backbone_model": "t5", # use in delta center,
"model_path_public": "t5-base", # use in delta center,
}
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-base/",
})
AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['adapter_t5-base'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/t5-base/",
})
AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['lora_t5-base'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-base/",
})
AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter++_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['low_rank_adapter_t5-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/t5-base/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-2,
"soft_token_num":100,
"token_init": False,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/t5-base/",
})
AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['prefix_t5-base'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"reparameterize": False,
"output_dir": "outputs/prefix/t5-base/",
})
# NOTE: redefines the soft_prompt entry above; this later block
# (learning_rate 3e-4, no soft_token_num) silently wins.
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/t5-base/",
})
#### T5-base
BaseConfigs['t5-small'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] * 7 + [0] * 8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}t5-small",
"tokenizer_name": f"{PATHBASE}t5-small",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
AllConfigs['prefix_t5-small'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-small/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser(description="Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)  # also creates configs/ if missing
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
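
Soft prompt tuning ("soft_token_num": 100) learns a block of virtual-token embeddings prepended to the input while the backbone stays frozen; "token_init" decides whether they start from real vocabulary rows or from random noise. A bare-bones sketch of the idea (not OpenDelta's implementation):

import torch
import torch.nn as nn

class SoftPrompt(nn.Module):
    def __init__(self, embed: nn.Embedding, soft_token_num=100, token_init=True):
        super().__init__()
        if token_init:      # start from the first soft_token_num vocab embeddings
            init = embed.weight[:soft_token_num].detach().clone()
        else:               # "token_init": False -> random initialization
            init = torch.randn(soft_token_num, embed.embedding_dim) * 0.5
        self.soft = nn.Parameter(init)

    def forward(self, input_embeds):                 # (batch, seq, dim)
        prefix = self.soft.unsqueeze(0).expand(input_embeds.size(0), -1, -1)
        return torch.cat([prefix, input_embeds], dim=1)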

View File

@ -0,0 +1,52 @@
{
"backbone_model": "beit",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk",
"delta_type": "lora",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "cifar10",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224",
"model_path_public": "beit-large-patch16-224",
"num_classes": 10,
"num_train_epochs": 20,
"output_dir": "outputs/lora/beit-large-patch16-224/cifar10",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": false,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "cifar10",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "cifar10",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["query","value"]
}
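
No "lora_r" appears in this file, so OpenDelta's default rank (8 in the generators above) applies. For a BEiT-large block the LoRA overhead on "query" and "value" is quick to tally (a sketch; layer count and hidden size assumed from the public beit-large-patch16-224 card):

hidden, r, layers = 1024, 8, 24        # beit-large dims (assumed); lora_r = 8
per_module = r * (hidden + hidden)     # A: (hidden x r) plus B: (r x hidden)
total = per_module * 2 * layers        # "query" and "value" in every layer
print(f"{total:,} extra params")       # 786,432 -- well under 1% of ~300M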

View File

@ -0,0 +1,52 @@
{
"backbone_model": "gpt-j",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "lora",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "wikitext",
"eval_steps": 500,
"evaluation_strategy": "steps",
"gradient_accumulation_steps":4,
"greater_is_better": false,
"learning_rate": 0.00003,
"load_best_model_at_end": true,
"max_source_length": 512,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B",
"model_path_public": "gpt-j-6B",
"num_train_epochs": 2,
"output_dir": "outputs/lora/gpt-j-6B/wikitext",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 2,
"per_device_train_batch_size": 2,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 500,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "wikitext",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "wikitext",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["20.attn.q_proj","21.attn.q_proj","22.attn.q_proj","23.attn.q_proj","24.attn.q_proj","25.attn.q_proj","26.attn.q_proj","27.attn.q_proj"]
}
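
Here "modified_modules" pins LoRA to q_proj in blocks 20-27 only; GPT-J-6B has 28 blocks, so this adapts just the top 8. Since OpenDelta matches the strings against full module names, the list can be generated rather than typed out (sketch):

modified_modules = [f"{i}.attn.q_proj" for i in range(20, 28)]
# Effective batch size = per_device_train_batch_size (2)
#   x gradient_accumulation_steps (4) x number of devices.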

View File

@ -0,0 +1,52 @@
{
"backbone_model": "roberta-large",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "lora",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "superglue-boolq",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"is_seq2seq": false,
"learning_rate": 0.0001,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large",
"model_path_public": "roberta-large",
"num_train_epochs": 20,
"output_dir": "outputs/lora/roberta-large/superglue-boolq",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": false,
"push_to_hub": false,
"push_to_dc": true,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "superglue-boolq",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "superglue-boolq",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["query","value"]
}

View File

@ -0,0 +1,52 @@
{
"backbone_model": "xlm-roberta-large",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "lora",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "superglue-wic",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"is_seq2seq": false,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large",
"model_path_public": "xlm-roberta-large",
"num_train_epochs": 20,
"output_dir": "outputs/lora/xlm-roberta-large/superglue-wic",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": false,
"push_to_dc": true,
"push_to_hub": false,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "superglue-wic",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "superglue-wic",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["query","value"]
}

View File

@ -0,0 +1,52 @@
{
"backbone_model": "gpt2",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "low_rank_adapter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "wikitext",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps":1,
"greater_is_better": false,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 768,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt2",
"model_path_public": "gpt2",
"num_train_epochs": 2,
"output_dir": "outputs/low_rank_adapter/gpt2/wikitext",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "wikitext",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "wikitext",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt2",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["attn","mlp"]
}

View File

@ -0,0 +1,51 @@
{
"backbone_model": "bert-large-cased",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "prefix",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "rte",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"is_seq2seq": false,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased",
"num_train_epochs": 20,
"output_dir": "outputs/prefix/bert-large-cased/rte",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": false,
"push_to_dc": true,
"push_to_hub": false,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "rte",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "rte",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["attention"]
}
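
Prefix tuning ("delta_type": "prefix") prepends trainable key/value vectors at every attention layer matched by "modified_modules": ["attention"]. For bert-large the added state is 2 x prefix_len x hidden per layer; a quick tally (sketch, with an illustrative prefix length since none is set in this file):

layers, hidden, prefix_len = 24, 1024, 64   # bert-large; prefix_len assumed
added = layers * 2 * prefix_len * hidden    # one key and one value prefix per layer
print(f"{added:,} trainable params")        # 3,145,728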

View File

@ -0,0 +1,51 @@
{
"backbone_model": "bart",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "soft_prompt",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "superglue-boolq",
"eval_steps": 500,
"evaluation_strategy": "steps",
"gradient_accumulation_steps":1,
"greater_is_better": true,
"learning_rate": 0.1,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bart-large",
"model_path_public": "bart-large",
"num_train_epochs": 50,
"output_dir": "outputs/soft_prompt/bart-large/superglue-boolq",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 500,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"soft_token_num":100,
"split_validation_test": true,
"task_name": "superglue-boolq",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "superglue-boolq",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bart-large",
"token_init": true,
"unfrozen_modules": [
"deltas"
],
"warmup_steps": 0
}

View File

@ -0,0 +1,3 @@
from .tasks import TASK_MAPPING, AutoTask
# from .data_collator import TaskDataCollatorForSeq2Seq
# from .postprocessors import AutoPostProcessor

Some files were not shown because too many files have changed in this diff.