commit b856ad0fb93eb77bd9b76de11992851f44579556 Author: shengdinghu Date: Mon Feb 14 21:19:03 2022 +0800 first commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e099ba4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,20 @@ +data/ +**/__pycache__/ +logs/* +experiments/logs +!logs/.gitkeep +datasets/* +!datasets/*.sh +.vscode/ +*.egg-info/ +eggs/ +.eggs/ +*.egg +**.egg +build/ +_build/ +**/build/ +outputs/ +log.txt +**/DeltaHub/ +*beans \ No newline at end of file diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..7f43aea --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,29 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 1 + +# Set the version of Python and other tools you might need +build: + os: ubuntu-20.04 + tools: + python: "3.9" + # You can also specify other tool versions: + # nodejs: "16" + # rust: "1.55" + # golang: "1.17" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + +# If using Sphinx, optionally build your docs in additional formats such as PDF +# formats: +# - pdf + +# Optionally declare the Python requirements required to build your docs +python: + install: + - requirements: docs/requirements.txt \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..6d3a9f7 --- /dev/null +++ b/README.md @@ -0,0 +1,94 @@ +
+ + + + +**An Open-Source Framework for Parameter-Efficient Tuning.** + +------ + +

+ Overview • + Installation • + Supported Models • + Docs • + Performance • + + +

+ +
+
+![version](https://img.shields.io/badge/version-v0.1.0-blue)
+
+## Overview
+
+OpenDelta is a toolkit for parameter-efficient tuning methods (which we dub *delta tuning*), by which users can flexibly assign (or add) a small amount of parameters to update while keeping most of the parameters frozen. With OpenDelta, users can easily implement prefix-tuning, adapters, Lora, or any other type of delta tuning with their preferred PTMs.
+
+## Installation
+Create a virtual environment (optional):
+```shell
+conda create -n opendelta_env python=3.8
+conda activate opendelta_env
+```
+
+### Using Pip
+
+Our repo is tested on Python 3.6+ and PyTorch 1.8.1+. Install OpenDelta using pip as follows:
+
+```shell
+pip install opendelta
+```
+
+To play with the latest features, you can also install OpenDelta from the source.
+
+### Build from Source
+
+```shell
+git clone https://github.com/thunlp/OpenDelta.git
+cd OpenDelta
+```
+
+#### Option 1: If you won't modify the code, run
+```shell
+python setup.py install
+```
+
+#### Option 2: If you want to modify the code, run
+```shell
+python setup.py develop
+```
+
+
+
+### Verified Supported Models
+
+**You can try OpenDelta on any backbone model based on PyTorch.** However, there is a small chance that the interface of a backbone model's submodules is not supported. We have therefore verified some commonly used models that OpenDelta is sure to support.
+
+We will keep testing more and more emerging models.
+
+Pull requests are welcome when you successfully apply OpenDelta to your own backbone model.
+
+
+| | Lora | Bias
Tuning | Adapter
 Houlsby | Adapter
 Pfeiffer | Adapter
 Drop | Adapter
 Low-Rank | Compacter | Prefix
Tuning | Prompt
Tuning | +| --------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ----- | ----- | +| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | +| BART | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | +| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | +| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | +| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| T5-3b(parallel)| ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| Deberta-v2 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | +| CTRL | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | +| ViT | ✅ | | | | | | | | | + + +### Performance Checked Combination + +Google sheet [here](https://docs.google.com/spreadsheets/d/1BIVa8ocAPga-u7rBOXLYaTfaJSjI1dWfwohmLjmFDrY/edit?usp=sharing) + + + diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..6fcf05b --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/readme.md b/docs/readme.md new file mode 100644 index 0000000..a9e75df --- /dev/null +++ b/docs/readme.md @@ -0,0 +1,20 @@ +# OpenDelta Documentation + +To build this doc locally, please firstly install [sphinx](https://www.sphinx-doc.org/en/master/) packages. + +``` +pip install sphinx +pip install sphinx_rtd_theme +pip install sphinx_copybutton +pip install sphinx_toolbox +pip install myst_parser +``` + +Then install opendelta either from source, or from pip. After that, + +``` +cd docs +make html +``` + +Then open the generated `docs/build/html/index.html` in your local browser. 
\ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..2f0ef65 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,13 @@ +sphinx_copybutton +sphinx_rtd_theme +sphinx_toolbox +torch +transformers +sentencepiece==0.1.96 +tqdm==4.62.2 +openprompt +loralib +decorator +rich +myst_parser +web.py \ No newline at end of file diff --git a/docs/source/_static/css/custom.css b/docs/source/_static/css/custom.css new file mode 100644 index 0000000..9cfdbbe --- /dev/null +++ b/docs/source/_static/css/custom.css @@ -0,0 +1,268 @@ +/* a, */ +.wy-menu-vertical header, +.wy-menu-vertical p.caption, +.wy-nav-top .fa-bars, +.wy-menu-vertical a:hover, + +/* Colors and text decoration. + For example, :black:`text in black` or :blink:`text blinking` in rST. */ + + /* .black { + color: black; +} + +.gray { + color: gray; +} + +.grey { + color: gray; +} + +.silver { + color: silver; +} + +.white { + color: white; +} + +.maroon { + color: maroon; +} + +.red { + color: red; +} + +.magenta { + color: magenta; +} + +.fuchsia { + color: fuchsia; +} + +.pink { + color: pink; +} + +.orange { + color: rgba(218, 135, 12, 0.897); +} */ + +/* .string { + color: rgb(172, 51, 44); +} */ + +/* .yellow { + color: yellow; +} + +.lime { + color: lime; +} + +.green { + color: green; +} + +.olive { + color: olive; +} + +.teal { + color: teal; +} + +.cyan { + color: cyan; +} + +.aqua { + color: aqua; +} + +.blue { + color: blue; +} + +.navy { + color: navy; +} + +.purple { + color: purple; +} + +.under { + text-decoration: underline; +} + +.over { + text-decoration: overline; +} + +.blink { + text-decoration: blink; +} + +.line { + text-decoration: line-through; +} + +.strike { + text-decoration: line-through; +} + +.it { + font-style: italic; +} + +.ob { + font-style: oblique; +} + +.small { + font-size: small; +} + +.large { + font-size: large; +} + +.smallpar { + font-size: small; +} */ + +a:link { + color: rgb(141, 99, 224) +} + +a:visited { + color: rgb(141, 99, 224) +} + +a:hover { + color: rgb(147, 47, 218) +} +.rst-content code.literal +{ + color: rgb(172, 49, 42) !important; + /* #5360f0 */ +} + +.rst-content tt.literal +{ + color: #f06b53 !important; +} +/* #a153f0 */ +/* inspired by sphinx press theme */ +.wy-menu.wy-menu-vertical li.toctree-l1.current > a { + border-left: solid 15px rgb(150, 92, 232) !important; + text-indent: -15px; + border-top: none; + border-bottom: none; +} + +.wy-menu.wy-menu-vertical li.toctree-l1.current > ul { + border-left: solid 15px #ddcaf7 !important; +} +/* inspired by sphinx press theme */ + +.wy-nav-side { + color: unset !important; + background: unset !important; + border-right: solid 1px #ccc !important; +} + +.wy-side-nav-search, +.wy-nav-top, +.wy-menu-vertical li, +.wy-menu-vertical li a:hover, +.wy-menu-vertical li a +{ + background: unset !important; +} + +.wy-menu-vertical li.current a { + border-right: unset !important; +} + +.wy-side-nav-search div, +.wy-menu-vertical a { + color: #404040 !important; +} + +.wy-menu-vertical button.toctree-expand { + color: #333 !important; +} + +.wy-nav-content { + max-width: unset; +} + +.rst-content { + max-width: 900px; +} + +.wy-nav-content .icon-home:before { + content: "Docs"; +} + +.wy-side-nav-search .icon-home:before { + content: ""; +} + +dl.field-list { + display: block !important; +} + +dl.field-list > dt:after { + content: "" !important; +} + +dl.field-list > dt { + display: table; + padding-left: 6px !important; + padding-right: 6px !important; + margin-bottom: 4px 
!important; + padding-bottom: 1px !important; + background: rgb(252, 237, 208); + border-left: solid 2px rgb(231, 181, 134); +} + + +dl.py.class>dt +{ + color: rgba(17, 16, 17, 0.822) !important; + background: rgb(247, 234, 252) !important; + border-top: solid 2px #b620d0 !important; +} + +dl.py.method>dt +{ + background: rgb(250, 239, 241) !important; + border-left: solid 2px rgb(199, 83, 106) !important; +} + +dl.py.attribute>dt, +dl.py.property>dt +{ + background: rgba(194, 233, 248, 0.1) !important; + border-left: solid 2px #58b5cc !important; +} + +.fa-plus-square-o::before, .wy-menu-vertical li button.toctree-expand::before, +.fa-minus-square-o::before, .wy-menu-vertical li.current > a button.toctree-expand::before, .wy-menu-vertical li.on a button.toctree-expand::before +{ + content: ""; +} + +.rst-content .viewcode-back, +.rst-content .viewcode-link +{ + font-size: 120%; +} + + diff --git a/docs/source/_static/js/custom.js b/docs/source/_static/js/custom.js new file mode 100644 index 0000000..489b7d5 --- /dev/null +++ b/docs/source/_static/js/custom.js @@ -0,0 +1,7 @@ +document.addEventListener("DOMContentLoaded", function(event) { + document.querySelectorAll(".wy-menu.wy-menu-vertical > ul.current > li > a").forEach(a => a.addEventListener("click", e=>{ + f = document.querySelector(".wy-menu.wy-menu-vertical > ul.current > li > ul") + if (f.style.display=='none') { f.style.display='block'; } else f.style.display = 'none' + })); + document.querySelectorAll(".headerlink").forEach(a => a.text="\u{1F517}"); +}); \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..8408041 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,144 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) +import sys +sys.path.insert(0, "../../") +import datetime +import sphinx_rtd_theme +import doctest +import opendelta +import opendelta.delta_models + +# -- Project information ----------------------------------------------------- + +project = 'OpenDelta' +author = 'THUNLP OpenDelta Team' +copyright = '{}, {}, Licenced under the Apache License, Version 2.0'.format(datetime.datetime.now().year, author) + + +# The full version, including alpha/beta/rc tags +release = '0.1.1' +version = "0.1.1" + +html_theme = 'sphinx_rtd_theme' +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +doctest_default_flags = doctest.NORMALIZE_WHITESPACE +autodoc_member_order = 'bysource' +intersphinx_mapping = {'python': ('https://docs.python.org/', None), +"torch": ("https://pytorch.org/docs/stable/", None),} + +html_show_sourcelink = True + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.doctest', + 'sphinx.ext.intersphinx', + 'sphinx.ext.mathjax', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'sphinx.ext.githubpages', + 'sphinx_copybutton', + 'sphinx_toolbox.collapse', + 'myst_parser', +] + +myst_enable_extensions = [ + "html_image", + "colon_fence", + "html_admonition", + "amsmath", + "dollarmath", +] + +source_suffix = { + '.rst': 'restructuredtext', + '.txt': 'markdown', + '.md': 'markdown', +} + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +# exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +# html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_theme_options = { + # 'collapse_navigation': False, + # 'display_version': True, + #'logo_only': False, + 'navigation_depth': 2, +} + + +html_static_path = ['_static'] +html_css_files = ['css/custom.css'] +html_js_files = ['js/custom.js'] +rst_context = {'opendelta': opendelta} +# rst_epilog = "\n.. include:: .special.rst\n" +add_module_names = False + +def include_only_tagged(app, what, name, obj, skip, options): + inclusion_tag_format = "[NODOC]" #can be any pattern here, choose what works for you + for tag in app.tags.tags: + if obj.__doc__ is not None and not obj.__doc__.startswith(inclusion_tag_format): + return False + return True + +def skip2(app, what, name, obj, skip, options): + members = [ + '__init__', + '__repr__', + '__weakref__', + '__dict__', + '__module__', + ] + return True if name in members else skip + +def skip(app, what, name, obj, skip, options): + skip = include_only_tagged(app, what, name, obj, skip, options) or\ + skip2(app, what, name, obj, skip, options) + return skip + +def setup(app): + + + + def rst_jinja_render(app, docname, source): + src = source[0] + rendered = app.builder.templates.render_string(src, rst_context) + source[0] = rendered + + app.connect('autodoc-skip-member', skip) + app.connect("source-read", rst_jinja_render) \ No newline at end of file diff --git a/docs/source/imgs/afterfreeze.png b/docs/source/imgs/afterfreeze.png new file mode 100644 index 0000000..5d37408 Binary files /dev/null and b/docs/source/imgs/afterfreeze.png differ diff --git a/docs/source/imgs/bart-base.png b/docs/source/imgs/bart-base.png new file mode 100644 index 0000000..52b023d Binary files /dev/null and b/docs/source/imgs/bart-base.png differ diff --git a/docs/source/imgs/bert_vis.png b/docs/source/imgs/bert_vis.png new file mode 100644 index 0000000..a64c175 Binary files /dev/null and b/docs/source/imgs/bert_vis.png differ diff --git a/docs/source/imgs/bertdelta_noparam.png b/docs/source/imgs/bertdelta_noparam.png new file mode 100644 index 0000000..3f1cdf7 Binary files /dev/null and b/docs/source/imgs/bertdelta_noparam.png differ diff --git a/docs/source/imgs/bertdelta_vis.png b/docs/source/imgs/bertdelta_vis.png new file mode 100644 index 0000000..e21cf20 Binary files /dev/null and 
b/docs/source/imgs/bertdelta_vis.png differ diff --git a/docs/source/imgs/commonstructure_vis.png b/docs/source/imgs/commonstructure_vis.png new file mode 100644 index 0000000..e5db4e1 Binary files /dev/null and b/docs/source/imgs/commonstructure_vis.png differ diff --git a/docs/source/imgs/composition_of_delta.png b/docs/source/imgs/composition_of_delta.png new file mode 100644 index 0000000..b33a060 Binary files /dev/null and b/docs/source/imgs/composition_of_delta.png differ diff --git a/docs/source/imgs/defaultmodification.png b/docs/source/imgs/defaultmodification.png new file mode 100644 index 0000000..a729ccb Binary files /dev/null and b/docs/source/imgs/defaultmodification.png differ diff --git a/docs/source/imgs/hint-icon-2.jpg b/docs/source/imgs/hint-icon-2.jpg new file mode 100644 index 0000000..0d9a0c6 Binary files /dev/null and b/docs/source/imgs/hint-icon-2.jpg differ diff --git a/docs/source/imgs/hint-icon.png b/docs/source/imgs/hint-icon.png new file mode 100644 index 0000000..83ebb44 Binary files /dev/null and b/docs/source/imgs/hint-icon.png differ diff --git a/docs/source/imgs/interact.jpg b/docs/source/imgs/interact.jpg new file mode 100644 index 0000000..0cbcee7 Binary files /dev/null and b/docs/source/imgs/interact.jpg differ diff --git a/docs/source/imgs/multiple_to_one_layer.png b/docs/source/imgs/multiple_to_one_layer.png new file mode 100644 index 0000000..1df3e24 Binary files /dev/null and b/docs/source/imgs/multiple_to_one_layer.png differ diff --git a/docs/source/imgs/name_based_addressing.png b/docs/source/imgs/name_based_addressing.png new file mode 100644 index 0000000..c341a3d Binary files /dev/null and b/docs/source/imgs/name_based_addressing.png differ diff --git a/docs/source/imgs/plugunplug1.png b/docs/source/imgs/plugunplug1.png new file mode 100644 index 0000000..7dc17f1 Binary files /dev/null and b/docs/source/imgs/plugunplug1.png differ diff --git a/docs/source/imgs/plugunplug2.png b/docs/source/imgs/plugunplug2.png new file mode 100644 index 0000000..1330350 Binary files /dev/null and b/docs/source/imgs/plugunplug2.png differ diff --git a/docs/source/imgs/plugunplug3.png b/docs/source/imgs/plugunplug3.png new file mode 100644 index 0000000..3f6aa5d Binary files /dev/null and b/docs/source/imgs/plugunplug3.png differ diff --git a/docs/source/imgs/plugunplug4.png b/docs/source/imgs/plugunplug4.png new file mode 100644 index 0000000..a0a6e24 Binary files /dev/null and b/docs/source/imgs/plugunplug4.png differ diff --git a/docs/source/imgs/plugunplug5.png b/docs/source/imgs/plugunplug5.png new file mode 100644 index 0000000..c7b2dcc Binary files /dev/null and b/docs/source/imgs/plugunplug5.png differ diff --git a/docs/source/imgs/plugunplug6.png b/docs/source/imgs/plugunplug6.png new file mode 100644 index 0000000..7adf668 Binary files /dev/null and b/docs/source/imgs/plugunplug6.png differ diff --git a/docs/source/imgs/pointing-right-finger.png b/docs/source/imgs/pointing-right-finger.png new file mode 100644 index 0000000..6216065 Binary files /dev/null and b/docs/source/imgs/pointing-right-finger.png differ diff --git a/docs/source/imgs/raw_print.png b/docs/source/imgs/raw_print.png new file mode 100644 index 0000000..836b2de Binary files /dev/null and b/docs/source/imgs/raw_print.png differ diff --git a/docs/source/imgs/t5lora.png b/docs/source/imgs/t5lora.png new file mode 100644 index 0000000..1d78cdb Binary files /dev/null and b/docs/source/imgs/t5lora.png differ diff --git a/docs/source/imgs/todo-icon.jpeg b/docs/source/imgs/todo-icon.jpeg new 
file mode 100644 index 0000000..9846fc2 Binary files /dev/null and b/docs/source/imgs/todo-icon.jpeg differ diff --git a/docs/source/imgs/toy-delta.png b/docs/source/imgs/toy-delta.png new file mode 100644 index 0000000..ab32640 Binary files /dev/null and b/docs/source/imgs/toy-delta.png differ diff --git a/docs/source/imgs/transformers_structure.png b/docs/source/imgs/transformers_structure.png new file mode 100644 index 0000000..ded54d9 Binary files /dev/null and b/docs/source/imgs/transformers_structure.png differ diff --git a/docs/source/index.md b/docs/source/index.md new file mode 100644 index 0000000..e62cfb6 --- /dev/null +++ b/docs/source/index.md @@ -0,0 +1,54 @@ +OpenDelta's documentation! +===================================== + +OpenDelta is a **Plug-and-play** Library of the parameter-efficient fine-tuning ([delta-tuning](WhatisDelta)) technology for pre-trained models. + + +## Essential Advantages: + +- Clean: No need to edit the backbone PTM’s codes. +- Simple: Migrating from full-model tuning to delta-tuning needs as little as 3 lines of codes. +- Sustainable: Most evolution in external library doesn’t require a new OpenDelta. +- Extendable: Various PTMs can share the same delta-tuning codes. +- Flexible: Able to apply delta-tuning to (almost) any position of the PTMs. + +```{eval-rst} +.. toctree:: + :maxdepth: 1 + :caption: Getting Started + + notes/overview.md + notes/installation.md + notes/usage.md + notes/visualization.md + notes/saveload.md + +.. toctree:: + :maxdepth: 1 + :caption: Advanced Usage + + notes/keyfeature.md + notes/unifyname.md + notes/autodelta.md + notes/composition.md + notes/pluginunplug.md + notes/acceleration.md + notes/explored_config.md + notes/citation.md + +.. toctree:: + :maxdepth: 2 + :caption: Package Reference + + modules/base + modules/deltas + modules/auto_delta + modules/utils + + +Indices and tables +================== + +* :ref:`genindex` + +``` \ No newline at end of file diff --git a/docs/source/modules/auto_delta.rst b/docs/source/modules/auto_delta.rst new file mode 100644 index 0000000..cc9d3d4 --- /dev/null +++ b/docs/source/modules/auto_delta.rst @@ -0,0 +1,14 @@ +Auto Classes +====================================== + + +AutoDeltaConfig +------------------------------------ +.. autoclass:: opendelta.auto_delta.AutoDeltaConfig + :members: + + +AutoDeltaModel +------------------------------------ +.. autoclass:: opendelta.auto_delta.AutoDeltaModel + :members: diff --git a/docs/source/modules/base.rst b/docs/source/modules/base.rst new file mode 100644 index 0000000..3a1a35e --- /dev/null +++ b/docs/source/modules/base.rst @@ -0,0 +1,14 @@ +Base Classes +====================================== + + +BaseDeltaConfig +------------------------------------ +.. autoclass:: opendelta.delta_configs.BaseDeltaConfig + :members: + + +DeltaBase +------------------------------------ +.. autoclass:: opendelta.basemodel.DeltaBase + :members: diff --git a/docs/source/modules/deltas.rst b/docs/source/modules/deltas.rst new file mode 100644 index 0000000..5a94fb6 --- /dev/null +++ b/docs/source/modules/deltas.rst @@ -0,0 +1,46 @@ +Delta Models +====================================== + + + +Lora +--------------------------------------- +.. autoclass:: opendelta.LoraModel + :members: + + + +BitFit +--------------------------------------- +.. autoclass:: opendelta.BitFitModel + :members: + + +Adapter +--------------------------------------- +.. 
autoclass:: opendelta.AdapterModel + :members: + + +LowRankAdapter +--------------------------------------- +.. autoclass:: opendelta.LowRankAdapterModel + :members: + + +Compacter +--------------------------------------- +.. autoclass:: opendelta.CompacterModel + :members: + + +Prefix tuning +------------------------------------ +.. autoclass:: opendelta.PrefixModel + :members: + + +Soft Prompt Tuning +------------------------------------ +.. autoclass:: opendelta.SoftPromptModel + :members: diff --git a/docs/source/modules/utils.md b/docs/source/modules/utils.md new file mode 100644 index 0000000..3d11305 --- /dev/null +++ b/docs/source/modules/utils.md @@ -0,0 +1,45 @@ +# Utils + + +## SaveLoadMixin + +```{eval-rst} +.. autoclass:: opendelta.utils.saving_loading_utils.SaveLoadMixin + :members: +``` + +## Visualization + + +```{eval-rst} +.. autoclass:: opendelta.utils.visualization.Visualization + :members: +``` + +## Structure Map +```{eval-rst} +.. autoclass:: opendelta.utils.structure_mapping.CommonStructureMap + :members: +``` + +## Utility Functions + +### Hashing +```{eval-rst} +.. automodule:: opendelta.utils.model_md5 + :members: +``` + +### Signature +```{eval-rst} +.. automodule:: opendelta.utils.signature + :members: +``` + +### Named-based addressing +```{eval-rst} +.. automodule:: opendelta.utils.name_based_addressing + :members: +``` + + diff --git a/docs/source/notes/acceleration.md b/docs/source/notes/acceleration.md new file mode 100644 index 0000000..6b088b4 --- /dev/null +++ b/docs/source/notes/acceleration.md @@ -0,0 +1,6 @@ + +(acceleration)= +# OpenDelta+ + We are working on testing and improving the functionality with work with other acceleration packages for model training and inference. For example, [deepspeed](https://github.com/microsoft/DeepSpeed), [BMInf](https://github.com/OpenBMB/BMInf). + +Feel free to contact us via email (shengdinghu@gmail.com) if you have any suggestion. diff --git a/docs/source/notes/autodelta.md b/docs/source/notes/autodelta.md new file mode 100644 index 0000000..eb9fb4e --- /dev/null +++ b/docs/source/notes/autodelta.md @@ -0,0 +1,67 @@ +(autodelta)= +# AutoDelta Mechanism + +Inspired by [Huggingface transformers AutoClasses](https://huggingface.co/docs/transformers/v4.16.2/en/model_doc/auto#transformers.AutoModel) , we provide an AutoDelta features for the users to + +1. Easily to experiment with different delta models +2. Fast deploy from configuration file, especially from the repos in [DeltaHub](https://huggingface.co/DeltaHub). + + +## Easily load from dict, so that subject to change the type of delta models. 
+
+```python
+from opendelta import AutoDeltaConfig, AutoDeltaModel
+from transformers import T5ForConditionalGeneration
+
+backbone_model = T5ForConditionalGeneration.from_pretrained("t5-base")
+```
+
+We can load a config from a dict:
+```python
+config_dict = {
+    "delta_type": "lora",
+    "modified_modules": [
+        "SelfAttention.q",
+        "SelfAttention.v",
+        "SelfAttention.o"
+    ],
+    "lora_r": 4}
+delta_config = AutoDeltaConfig.from_dict(config_dict)
+```
+
+Then use the config to add a delta model to the backbone model:
+```python
+delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=backbone_model)
+
+# now visualize the modified backbone_model
+from opendelta import Visualization
+Visualization(backbone_model).structure_graph()
+```
+
+
+````{collapse} Click to view output
+```{figure} ../imgs/t5lora.png
+---
+width: 600px
+name: t5lora
+---
+```
+````
+
+
+
+## Fast deploy from a finetuned delta checkpoint on DeltaHub
+
+```python
+delta_model = AutoDeltaModel.from_finetuned("DeltaHub/sst2-t5-base", backbone_model=backbone_model) # TODO: the link may change.
+```
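+
+The same interfaces also work with a delta checkpoint you have saved yourself (see [Save and Share the Delta](saveload)). Below is a minimal sketch of that round trip; the local directory name is just a placeholder of your own choosing.
+
+```python
+# Hypothetical local path: save a delta you trained yourself, then
+# redeploy it onto a freshly loaded backbone model.
+delta_model.save_finetuned("./my_local_delta")
+
+fresh_backbone = T5ForConditionalGeneration.from_pretrained("t5-base")
+delta_model = AutoDeltaModel.from_finetuned("./my_local_delta", backbone_model=fresh_backbone)
+```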
+

**Hash checking**

+Since the delta model only works together with the backbone model,
+we automatically check whether you load the delta model in the same way as it was trained.
+

+

+We calculate the trained model's [md5](http://some_link) and save it to the config. After loading the delta model, we re-calculate the md5 to see whether it has changed.
+

Pass `check_hash=False` to disable the hash checking.

+
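+For intuition, the check amounts to hashing the backbone parameters before and after loading. The helper below is only an illustration of the idea, not OpenDelta's actual implementation:
+
+```python
+import hashlib
+import torch.nn as nn
+
+def backbone_md5(module: nn.Module) -> str:
+    # Hash the raw bytes of every parameter in a fixed order, so that any
+    # change to the backbone weights yields a different digest.
+    md5 = hashlib.md5()
+    for name, param in sorted(module.state_dict().items()):
+        md5.update(name.encode("utf-8"))
+        md5.update(param.detach().cpu().numpy().tobytes())
+    return md5.hexdigest()
+```
+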
\ No newline at end of file diff --git a/docs/source/notes/citation.md b/docs/source/notes/citation.md new file mode 100644 index 0000000..4c41201 --- /dev/null +++ b/docs/source/notes/citation.md @@ -0,0 +1,3 @@ +# Citation + + We are working on a technical report. \ No newline at end of file diff --git a/docs/source/notes/composition.md b/docs/source/notes/composition.md new file mode 100644 index 0000000..151aa37 --- /dev/null +++ b/docs/source/notes/composition.md @@ -0,0 +1,52 @@ +(composition)= +# Composition of delta models + +With OpenDelta, you can perform compostion of different delta models. + + +### Add different deltas to the backbone + +``` +from transformers import AutoModelForSequenceClassification +model = AutoModelForSequenceClassification.from_pretrained("roberta-base") +from opendelta import LoraModel, AdapterModel +delta_model = LoraModel(backbone_model=model, modified_modules=['key'], lora_r=1) +delta_model2 = AdapterModel(backbone_model=model, modified_modules=['output'], bottleneck_dim=12) +delta_model.log() +``` +````{collapse} Click to view output +```{figure} ../imgs/composition_of_delta.png +--- +width: 600px +name: defaultmodification +--- +``` +```` + + + +### Even add multiple delta to the same layer + +``` +from transformers import AutoModelForSequenceClassification +model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base") +from opendelta import AdapterModel, LowRankAdapterModel +delta_model = AdapterModel(backbone_model=model, modified_modules=['fc2']) +delta_model2 = AdapterModel(backbone_model=model, modified_modules=['fc2'], bottleneck_dim=12) +delta_model3 = LowRankAdapterModel(backbone_model=model, modified_modules=['fc2'], reduction_factor=12) +delta_model.log() +``` +````{collapse} Click to view output +```{figure} ../imgs/multiple_to_one_layer.png +--- +width: 600px +name: defaultmodification +--- +``` +```` +:::{admonition} Order of Insertion +:class: warning +**When adding to the same layer, please pay attention to the order of adding delta.** As the above example, adapter is added after the `fc2`, the tensor will first go through `adapter` then go through `adapter_1`, at last `compacter`. If the delta is added before the backbone layer, then the last added delta will be the first to go through. + +Also, pay attention to the detaching order. The delta that is first added should be the last to be detached. +::: \ No newline at end of file diff --git a/docs/source/notes/explored_config.md b/docs/source/notes/explored_config.md new file mode 100644 index 0000000..34bd1f4 --- /dev/null +++ b/docs/source/notes/explored_config.md @@ -0,0 +1,11 @@ +(favoredconfiguration)= +# Favored Configuration + + We will add the commonly used configuration of delta models HERE in future. + +E.g. +- the modified_modules (position of delta), +- hyperparameter that are the most efficient +- the favored composition between delta models + +Currenlty, use the default setting, explore it by yourself, or refer to existing papers' configuration! \ No newline at end of file diff --git a/docs/source/notes/installation.md b/docs/source/notes/installation.md new file mode 100644 index 0000000..967aa43 --- /dev/null +++ b/docs/source/notes/installation.md @@ -0,0 +1,24 @@ + +(installation)= +# Installation + + +OpenDelta is tested on on [Python 3.8](https://www.python.org/) and [Pytorch 1.9](). 
+ +```bash +pip install opendelta +``` + +or from the source +```bash +git clone +cd OpenDelta +python setup.py install +``` + +If you want to do some modifications on the code for your research, run +```bash +git clone +cd OpenDelta +python setup.py develop +``` \ No newline at end of file diff --git a/docs/source/notes/keyfeature.md b/docs/source/notes/keyfeature.md new file mode 100644 index 0000000..b79367a --- /dev/null +++ b/docs/source/notes/keyfeature.md @@ -0,0 +1,200 @@ +(keyfeature)= +# Philosophy and Key Features + +:::{admonition} Plug-and-play Design. +:class: tip + +Existing open-source project to propogate this **''delta-tuning''** paradigm includes +AdapterHub, which copies the transformers code base and modify on it, which makes it unintuitive to transfer from a normal code base to a delta-tuning ones. + +OpenDelta approaches this problem via a **true plug-and-play** fashion to the PLMs. To migrate from a full-model finetuning training scripts to a delta tuning training scripts, you **DO NOT** need to change the backbone bone model code base to an adapted code base. +::: + + +Here is how we achieve it. + + **Read through it will also help you to implement your own delta models in a sustainable way.** + +(namebasedaddr)= +## 1. Name-based submodule addressing. +We locate the submodules that we want to apply a delta layer via name-based addressing. + +In pytorch fashion, a submodule can be accessed from a root model via 'dot' addressing. For example, we define a toy language model + +```python +import torch.nn as nn +class MyNet1(nn.Module): + def __init__(self,): + super().__init__() + self.name_a = nn.Linear(5,5) + def forward(self, hiddens): + return self.name_a(hiddens) + +class MyNet2(nn.Module): + def __init__(self,): + super().__init__() + self.embedding = nn.Embedding(10,5) + self.name_b = nn.Sequential(MyNet1(), MyNet1()) + def forward(self, input_ids): + hiddens = self.embedding(input_ids) + return self.name_b(hiddens) + +root = MyNet2() +print(root.name_b[0].name_a) +# Linear(in_features=5, out_features=5, bias=True) +``` + +We can visualize the model (For details, see [visualization](visualization)) + +```python +from opendelta import Visualization +Visualization(root).structure_graph() +``` + +````{collapse} Click to view output +```{figure} ../imgs/name_based_addressing.png +--- +width: 500px +name: name_based_addressing +--- +``` +```` + +In this case, string `"name_b.0.name_a"` will be the name to address the submodule from the root model. + +Thus when applying a delta model to this toy net. + +``` +from opendelta import AdapterModel +AdapterModel(backbone_model=root, modified_modules=['name_b.0.name_a']) +Visualization(root).structure_graph() +``` + +````{collapse} Click to view output +```{figure} ../imgs/toy-delta.png +--- +width: 500px +name: toy-delta +--- +``` +```` + +### Makes addressing easier. + +Handcrafting the full names of submodules can be frustrating. We made some simplifications + +1. End-matching Rules. + + OpenDelta will take every modules that + **ends with** the provided name suffix as the modification [target module](target_module). + :::{admonition} Example + :class: tip + Taking DistilBert with an classifier on top as an example: + - set to `["0.attention.out_lin"]` will add delta modules to the attention output of distilbert's + ayer 0, i.e., `distilbert.transformer.layer.0.attention.out_lin`. + - set to `["attention.out_lin"]` will add the delta modules in every layer's `attention.out_lin`. + ::: + + +2. Regular Expression. 
+ Unit test and Doc later. + +3. Interactive Selection. + + We provide a way to interact visually to select modules needed. + + ```python + from transformers import BertForMaskedLM + model = BertForMaskedLM.from_pretrained("bert-base-cased") + # suppose we load BERT + + from opendelta import LoraModel # use lora as an example, others are same + delta_model = LoraModel(backbone_model=model, interactive_modify=True) + ``` + + by setting `interactive_modify`, a web server will be opened on local host, and the link will be print in the terminal. + + ``` + http://0.0.0.0:8888/ + ``` + + If on your local machine, click to open the link for interactive modification. + + If on remote host, you could use port mapping. For example, vscode terminal will automatically do port mapping for you, you can simply use `control/command + click` to open the link. + + You can change the port number in case the default port number is occupied by other program by setting `interactive_modify=port_number`, in which port_number is an integer. + + The web page looks like the following figure. + + ```{figure} ../imgs/interact.jpg + --- + width: 500px + name: interact web page + --- + ``` + + - By clicking on `[+]`/`[-]` to expand / collapse tree nodes. + + - By clicking on text to select tree nodes, **yellow dotted** box indicates the selection. + + - **Double** click on the pink `[*]` is an advanced option to unfold the repeated nodes. By default, modules with the same architecture are folded into one node and are marked in red, for example, the `BertLayer` of layers 0~11 in the above figure are in the same structure. Regular model changes will make the same changes to each layers. + + - If you want to change only a few of them, first double-click on `[*]`, then select the parts you want in the unfolded structure. + + - If you want to make the same change to all but a few of them, first select the common parts you want in the folded structure, then double-click on `[*]` to remove the few positions you don't need to change in the expanded structure. + + Click `submit` button on the top-right corner, then go back to your terminal, you can get a list of name-based addresses printed in the terminal in the following format, and these modules are being "delta". + + ``` + modified_modules: + [bert.encoder.layer.0.output.dense, ..., bert.encoder.layer.11.output.dense] + ``` + +## 2. Three basic submodule-level delta operations. +We use three key functions to achieve the modifications to the backbone model outside the backbone model's code. + +1. **unfreeze some paramters** + + Some delta models will unfreeze a part of the model parameters and freeze other parts of the model, e.g. [BitFit](https://arxiv.org/abs/2106.10199). For these methods, just use [freeze_module](opendelta.basemodel.DeltaBase.freeze_module) method and pass the delta parts into `exclude`. + +2. **replace an module** + + Some delta models will replace a part of the model with a delta model, i.e., the hidden states will no longer go through the original submodules. This includes [Lora](https://arxiv.org/abs/2106.09685). + For these methods, we have an [update_module](opendelta.basemodel.DeltaBase.replace_module) interface. + +3. **insertion to the backbone** + + - **sequential insertion** + + Most adapter model insert a new adapter layer after/before the original transformers blocks. 
For these methods, insert the adapter's forward function after/before the original layer's forward function using [insert_sequential_module](opendelta.basemodel.DeltaBase.insert_sequential_module) interface. + - **parallel insertion** + + Adapters can also be used in a parallel fashion (see [Paper](https://arxiv.org/abs/2110.04366)). + For these methods, use [insert_parallel_module](opendelta.basemodel.DeltaBase.insert_parrellel_module) interface. + + +:::{admonition} Doc-preserving Insertion +:class: note +In the insertion operations, the replaced forward function will inherit the doc strings of the original functions. +::: + +## 3. Pseudo input to initialize. +Some delta models, especially the ones that is newly introduced into the backbone, will need to determine the parameters' shape. To get the shape, we pass a pseudo input to the backbone model and determine the shape of each delta layer according to the need of smooth tensor flow. + +:::{admonition} Pseudo Input +:class: warning +Most models in [Huggingface Transformers](https://huggingface.co/docs/transformers/index) have an attribute [dummy_inputs](https://github.com/huggingface/transformers/blob/v4.16.2/src/transformers/modeling_utils.py#L464). This will create a nonsensical input with the correct format to pass into the model's forward function. + +For the models that doesn't inherit/implement this attributes, we assume the pseudo input to the model is something like `input_id`, i.e., an integer tensor. +```python +pseudo_input = torch.tensor([[0,0,0]]) +# or +pseudo_input = torch.tensor([0,0,0]) +``` + We will add interface to allow more pseudo input in the future. +::: + + + + + diff --git a/docs/source/notes/knownissue.md b/docs/source/notes/knownissue.md new file mode 100644 index 0000000..139597f --- /dev/null +++ b/docs/source/notes/knownissue.md @@ -0,0 +1,2 @@ + + diff --git a/docs/source/notes/overview.md b/docs/source/notes/overview.md new file mode 100644 index 0000000..ccec827 --- /dev/null +++ b/docs/source/notes/overview.md @@ -0,0 +1,36 @@ +# What is Delta-tuning and Why OpenDelta? + +(WhatisDelta)= +:::{admonition} What is Delta? +:class: tip + +As Pre-trained language models (PLMs) have become the fundamental infrastructure on many NLP tasks and benchmarks, it is becoming increasingly clear from recent research that **larger models tend to lead to better performance**. However, large-scale PLMs also bring prohibitive adaptation costs when fine-tuning all the parameters of a model and retaining separate instances for different tasks. + +**Parameter-efficient model stimulation methods** thus have attracted researchers' eyes, which only tune a small fraction of model parameter while achieving comparable or even better performance than full-model fine-tuning, dubbed as "Delta-tuning". + +**Delta** thus means a small fraction $\Delta\Theta$ of parameters besides the pretrained models $\Theta_0$. + +\begin{gather*} +\Theta \sim \Theta_0\text{(frozen)} + \Delta\Theta\text{(tunable)} +\end{gather*} + +This open-source project implement several delta-tuning methods, which allows researchers and engineers to quickly migrate their codes from full-model tuning to delta-tuning without replace the backend (the implementation of the backbone PLM). +::: + + + +## Why OpenDelta? + +- Clean: No need to edit the backbone PTM’s codes. +- Simple: Migrating from full-model tuning to delta-tuning needs as little as 3 lines of codes. +- Sustainable: Most evolution in external library doesn’t require a new OpenDelta. 
+- Extendable: Various PTMs can share the same delta-tuning codes. +- Flexible: Able to apply delta-tuning to (almost) any position of the PTMs. + + +## Delta-tuning papers + + + + + diff --git a/docs/source/notes/pluginunplug.md b/docs/source/notes/pluginunplug.md new file mode 100644 index 0000000..eeadd57 --- /dev/null +++ b/docs/source/notes/pluginunplug.md @@ -0,0 +1,113 @@ +# Multitask Modeling using OpenDelta + +:::{admonition} Multitask Serving with Delta-tuning +:class: tip +A huge advange of Delta-tuning is that it can be used for multitask serving. +Imagine we have a pretrained model trained on a mix of data coming from multiple languages, e.g.,English, Chinese, and French. Now you want to have seperate models that specialise in Chinese, French, English. We can thus delta-tune three deltas on each language with small amount of additional language-specific data. During serving, when a Chinese sentence comes, you attach the "Chinese Delta", and next a French sentence comes, you detach the "Chinese Delta", and attach a "French Delta". +::: + +**Here is how to achieve multitask serving using OpenDelta.** + +```python +from transformers import AutoModelForSequenceClassification +model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base") +from opendelta import LoraModel +delta_model = LoraModel(backbone_model=model, modified_modules=['fc2']) +delta_model.log() +``` +````{collapse} Click to view output +```{figure} ../imgs/plugunplug1.png +--- +width: 800px +name: defaultmodification +--- +``` +```` + +Now we detach the deltas from the backbone +```python +delta_model.detach() +delta_model.log() +``` +````{collapse} Click to view output +```{figure} ../imgs/plugunplug2.png +--- +width: 800px +name: defaultmodification +--- +``` +```` + +We can reattach the deltas to the backbone +```python +delta_model.attach() +delta_model.log() +``` + +````{collapse} Click to view output +```{figure} ../imgs/plugunplug3.png +--- +width: 800px +name: defaultmodification +--- +``` +```` + +:::{admonition} Independence of Different Delta Models +:class: note +Different delta models will be independent in detaching and attaching. +(But the visualization will not show all deltas in the backbone model.) +```python +# continue from the above example +from opendelta import AdapterModel +delta_model2 = AdapterModel(backbone_model=model, modified_modules=['fc1']) +delta_model2.log() +``` +````{collapse} Click to view output +```{figure} ../imgs/plugunplug4.png +--- +width: 800px +name: defaultmodification +--- +``` +```` + +detach the lora delta +```python +delta_model.detach() # detach the lora delta +delta_model.log() +``` +````{collapse} Click to view output +```{figure} ../imgs/plugunplug5.png +--- +width: 800px +name: defaultmodification +--- +``` +```` + +detach the adapter delta and reattach the lora delta +```python +delta_model2.detach() # detach the adapter delta +delta_model.attach() # reattach the lora delta +delta_model.log() +``` +````{collapse} Click to view output +```{figure} ../imgs/plugunplug6.png +--- +width: 800px +name: defaultmodification +--- +``` +```` +::: + + +:::{admonition} BitFit not supported +:class: warning + Currently detach is not suitable for BitFit, which modify the requires_grad property. Please wait for future releases. 
+::: + + + + diff --git a/docs/source/notes/saveload.md b/docs/source/notes/saveload.md new file mode 100644 index 0000000..ecddd23 --- /dev/null +++ b/docs/source/notes/saveload.md @@ -0,0 +1,98 @@ +(saveload)= +# Save and Share the Delta + +## Space efficient saving without changing the code. +After a modified backbone model is trained, you can save only trained part without change to any code, because **the state dict of the backbone model has been changed to the trainable parts** + +```python +from opendelta import CompacterModel +from transformers import BertForMaskedLM +backbone_model = BertForMaskedLM.from_pretrained("bert-base-uncased") +delta_model = CompacterModel(backbone_model) # modify the default modules. + +# freeze module +delta_model.freeze_module(exclude=["deltas"], set_state_dict=True) +# or +delta_model.freeze_module(exclude=["deltas"]) +``` +### save the checkpoint. +now save the backbone_model in normal way, and the checkpoint is **very space efficient**. + +```python +# ... +# After some training pipeline +# ... +torch.save(backbone_model.state_dict(), "delta.ckpt") + +# the checkpoint size +import os +print("checkpoint size: {:.2f}M".format(os.path.getsize("delta.ckpt")/1024**2)) +# checkpoint size: 0.32M +``` + +### load the checkpoint. +In order to load the checkpoint, you should make sure the backbone model is a modified ones (so that it can take in the delta parameters). +Then load the checkpoint with `strict=False`. +```python +backbone_model.load_state_dict(torch.load("delta.ckpt"), strict=False) +# this will return long string of warning about the 'missing key'. +# if you want to supress it, use +# _ = backbone_model.load_state_dict(torch.load("delta.ckpt"), strict=False) +``` + +## Save/Load the entire model after training. + +### save a delta model. +```python +delta_model.save_finetuned("delta_model") +# Configuration saved in delta_model/config.json +# Model weights saved in delta_model/pytorch_model.bin +``` +This will save all the trained parameters and the configuration of the delta model to path `delta_model/` + +### load a delta model. + +```python +backbone_model = BertForMaskedLM.from_pretrained("bert-base-uncased") +delta_model.from_finetuned("delta_model", backbone_model, local_files_only=True) +# passing local_files_only=True will save the time of checking in the web. +``` + +## Share or download a model to/from the community. + +### Share. +```python +delta_model.save_finetuned("test_delta_model", push_to_hub = True) +``` + +### Download from community. +```python +from transformers import AutoModelForSeq2SeqLM +t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-base") +from opendelta import AutoDeltaModel +delta = AutoDeltaModel.from_finetuned("DeltaHub/lora_t5-base_mrpc", backbone_model=t5) +delta.log() +``` + +
+

**Push to Hub**

+

 Currently we only provide the option to push to the Hugging Face model hub.

+

 Before pushing to the hub, you may need to register an account on Hugging Face. You can refer to this [tutorial about model sharing and uploading](https://huggingface.co/docs/transformers/model_sharing).
+

+

 In some cases, your checkpoint may still be too large for git, so please install [`git-lfs`](https://git-lfs.github.com).
+

+
+ +:::{admonition} **Sharing with the Community** +:class: tip +If you are satisfied with your checkpoint, do not forget to share your model to DeltaHub: +1. Add yourself to DeltaHub with the [public link](https://huggingface.co/organizations/DeltaHub/share/QzkBuLSmlVnNhQqHYnekoTXwSRkoRHBwZA) +2. Be sure to edit your model card to clearly illustrate the delta model before you share. +3. Click `setting` on the model +4. Transfer the model in `rename or transfer this model` section. +::: + + +## Save & Load for Composition of Delta + + Currently save & load method is not suitable for [composition of delta model](compositon). Please wait for future releases. \ No newline at end of file diff --git a/docs/source/notes/unifyname.md b/docs/source/notes/unifyname.md new file mode 100644 index 0000000..0b3f485 --- /dev/null +++ b/docs/source/notes/unifyname.md @@ -0,0 +1,82 @@ +(unifyname)= + +# Unified Name Convention + +```{figure} ../imgs/transformers_structure.png +:width: 400px +:name: transformers_structure +``` + +Although different PTMs often share similar Transformers structures, the codebases, and most importantly, the variable names for each submodule, are quite different. + + + +On the one hand, we **encourage the users to first [visualize](visualization) the PTMs' structure and then determine the name of submoduels.** + +On the other hand, we designed a unified name convention of Transformer Structure, and provided several structure mapping from the original name to the unified name convention. + +In this section, we will illustrate the unified name convention and structure mapping. + + +## Common blocks in Transformers structure. + + +- embeddings (word embedding) +- encoder + - block + - $ (layer_id) + - attn + - q, k, v + - proj + - layer_norm + - ff + - w1 + - w2 + - layer_norm +- decoder (similar to encoder) +- lm_head + - proj + +Visualize bert-base using a common structure name: The submodules that are not common are grey. + +```{figure} ../imgs/commonstructure_vis.png +:width: 600px +:name: transformers_structure +``` + +(commonstructure)= +## Mappings + +Example of bert mapping: a tree with node names specified by "\_\_name\_\_" +```json +{ + "bert.embeddings.word_embeddings": {"__name__":"embeddings"}, + "bert.embeddings.position_embeddings": {"__name__":""}, + "bert.embeddings.token_type_embeddings": {"__name__":""}, + "bert.embeddings.LayerNorm": {"__name__":""}, + "bert.encoder": {"__name__":"encoder", + "layer": {"__name__":"block", + "$": {"__name__":"$", + "attention": {"__name__":"attn", + "self.query": {"__name__":"q"}, + "self.key": {"__name__":"k"}, + "self.value": {"__name__":"v"}, + "output.dense": {"__name__":"proj"}, + "output.LayerNorm": {"__name__":"layer_norm"}, + }, + "output": {"__name__":"ff", + "dense": {"__name__":"w2"}, + "LayerNorm": {"__name__":"layer_norm"} + }, + "intermediate.dense": {"__name__":"ff.w1"}, + } + } + }, + "cls.predictions": {"__name__": "lm_head", + "transform.dense": {"__name__":""}, + "transform.LayerNorm": {"__name__":""}, + "decoder": {"__name__":"proj"}, + } +} +``` + diff --git a/docs/source/notes/usage.md b/docs/source/notes/usage.md new file mode 100644 index 0000000..4ddcf94 --- /dev/null +++ b/docs/source/notes/usage.md @@ -0,0 +1,137 @@ +(basics)= +# Basic Usage +Now we introduce the general pipeline to migrate your full-model tuning scripts to a delta tuning one. 
+ +## STEP 1: Load the pretrained models + +```python +from transformers import AutoModelForSequenceClassification +model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base") # suppose we load BART +``` + +## STEP 2: Add delta modules +We provide two alternatives to add the delta modules. +### 2.1 Modification based on visualization +Suppose we want to make the feedforward layer of each block as our [modification target module](target_module), +We should first know what is the name of the feedforward layer in the BART model by visualization. *For more about visualization, see [Visualization](visualization).* + +```python +from opendelta import Visualization +Visualization(model).structure_graph() +``` + +````{collapse} Click to view output +```{figure} ../imgs/bart-base.png +--- +width: 600px +name: bart-base +--- +``` +```` + + + + +We can see from the structure graph that the feed forward layer in Bart is called `model.encoder.layers.$.fc1` and `model.encoder.layers.$.fc2`, where +`$` represent a number from 0-5. Since we want to apply adapter after *all* the feed forward layers, we specify the `modified_modules=['fc2']`, which is the common suffix for feed forward layers. + *For details about the name based addressing, see [Name-based submodule addressing](namebasedaddr)* + +Other configurations, such as the `bottleneck_dim` in Adapter, can be passed as key word arguments. +```python +from opendelta import AdapterModel +delta_model = AdapterModel(backbone_model=model, modified_modules=['fc2'], bottleneck_dim=12) +delta_model.log() # This will visualize the backbone after modification and other information. +``` + +(target_module)= +:::{admonition} Target module +:class: note +For different delta methods, the operation for the modification target is different. +- Adapter based method: Insert at the target module's forward function. +- BitFit: Add bias to all allowed position of the target module. +- Lora: Substitute the all the linear layers of the target module with [Lora.Linear](https://github.com/microsoft/LoRA/blob/main/loralib/layers.py#L92). +::: + +### 2.2 Use the default modification. +We also provide the default modifications of each delta methods for some commonly used PTMs (e.g., BERT, RoBERTA, DistilBERT, T5, GPT2), so the users don't need to specify the submodules to modify. + +The default modifications is achieved by a [common_structure mapping](commonstructure), that is, use the mapping a name of a module to the it's name on a common transformer structure. *For details about the default modification, see [Unified Name Convention](unifyname)* + + + +```python +# a seperate example using BERT. +from transformers import BertForMaskedLM +from opendelta import AdapterModel +model = BertForMaskedLM.from_pretrained("bert-base-cased") +delta_model = AdapterModel(model) # This will apply adapter to the self-attn and feed-forward layer. +delta_model.log() +``` +````{collapse} Click to view output +```{figure} ../imgs/defaultmodification.png +--- +width: 600px +name: defaultmodification +--- +``` +```` + + + + +:::{admonition} Delta model vs Backbone model +:class: note +The delta_model **CAN NOT** be used alone, and its [forward](opendelta.basemodel.DeltaBase.forward) is canceled. The training pipeline should be conducted on the backbone model (In the above example, its the `model`). +::: + +:::{admonition} Try different positions +:class: tip +OpenDelta provide the flexibility to add delta to different positions on the backbone model. 
For example, If you want to move the adapter in the above example after the layer norm of the feed forward layer. The code should be changed into +```python +# continue with the BART example, but not used later. +delta_model = AdapterModel(backbone_model=model, modified_modules=['final_layer_norm'], bottleneck_dim=12) +``` +The performance may vary due to positional differences, but there is no academic guarantee that one will outperform the other. +::: + + +:::{admonition} Favored Configurations +:class: tip +Feel confused about the flexibility that OpenDelta brings? NO WORRY! We will add [Favored Configurations](favoredconfiguration) soon. +::: + +## STEP 3: Freezing parameters +The main part of the backbone model is not automatically frozen (We may add the option in future). To freeze the main part of the backbone model except the trainable parts (usually the delta paramters), use [freeze_module](opendelta.basemodel.DeltaBase.freeze_module) method. The `exclude` field obeys the same name-based addressing rules as the `modified_modules` field. + +```python +# continue with the BART example +delta_model.freeze_module(exclude=["deltas", "layernorm_embedding"], set_state_dict=True) +delta_model.log() +``` +````{collapse} Click to view output +```{figure} ../imgs/afterfreeze.png +--- +width: 600px +name: afterfreeze +--- +``` +```` +The `set_state_dict=True` will tell the method to change the `state_dict` of the `backbone_model` to maintaining only the trainable parts. + + +## STEP 4: Normal training pipeline + +The **model** then can be trained in traditional training scripts. Two things should be noticed: + +:::{admonition} Note +:class: note +1. No need to change the optimizer, since the optimizer will only calculated and store gradient for those parameters with `requires_grad=True`, and the `requires_grad` attribute has been changed during the call to [freeze_module](opendelta.basemodel.DeltaBase.freeze_module) method. +2. `model.eval()` or `model.train()` should be used when needed to set dropout, etc. Delta model doesn't touch those configuration. +::: +## STEP 5: Saved/Share the Delta Model + + *see [Save a delta model to local, or share with the community](saveload).* + + + + diff --git a/docs/source/notes/visualization.md b/docs/source/notes/visualization.md new file mode 100644 index 0000000..d873d8e --- /dev/null +++ b/docs/source/notes/visualization.md @@ -0,0 +1,125 @@ +(visualization)= +# Visualize the Parameters + +When OpenDelta makes modifications to a pretrained model (PTM), it is beneficial to know what your PTM looks like, especially the location of the parameters. + +- **Before** applying opendelta, you can know **how to specify your modifications in terms of key addressing**. +- **After** the modification is done, you can know **if your modification is what you expected**, for example, whether the position of the delta +modules are desired, or whether you froze the correct parameters. + +Now let's begin to try the visualization utility. + +## Visualization is NOT easy using pytorch native function. + +```python +from transformers import BertForMaskedLM +backbone_model = BertForMaskedLM.from_pretrained("bert-base-uncased") +print(backbone_model) +``` + +````{collapse} Click to view output +```{figure} ../imgs/raw_print.png +--- +width: 600px +name: raw_print +--- +``` +```` + +The original presentation of models is **not tailored for repeated structures, big models, or parameters-centric tasks**. + + +## Using visualization from opendelta. 

First, let's visualize all the parameters in the BERT model. As we can see, the structure inside a BERT model, and the locations of all its parameters, are neatly represented as a tree. (See the [color scheme](color_schema) for the colors.)

```python
from opendelta import Visualization
model_vis = Visualization(backbone_model)
model_vis.structure_graph()
```


```{figure} ../imgs/bert_vis.png
---
width: 600px
name: bert_vis
---
```

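If the full tree feels overwhelming, the same utility can be pointed at a part of the model. This is a minimal sketch, under the assumption that `Visualization` accepts any `torch.nn.Module` as its root (the submodule path follows the graph above):

```python
# Minimal sketch: zoom in on a single encoder layer instead of the whole model.
# Assumes Visualization accepts any torch.nn.Module as its root.
from transformers import BertForMaskedLM
from opendelta import Visualization

backbone_model = BertForMaskedLM.from_pretrained("bert-base-uncased")
Visualization(backbone_model.bert.encoder.layer[0]).structure_graph()
```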

**Suggestion**

We can easily reference a module according to the graph:
```python
print(backbone_model.bert.encoder.layer[0].intermediate)
```
When using opendelta on a new backbone model, it's better to first visualize the child module names (shown in white), and then designate the `modified_modules`.
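If you prefer a plain-text view over the rendered graph, PyTorch's built-in `named_modules()` yields the same dotted names. A small sketch (pure PyTorch, nothing OpenDelta-specific) for listing candidate names before choosing `modified_modules`:

```python
# List dotted submodule names (e.g. "bert.encoder.layer.0.intermediate") with plain PyTorch.
# These are the same names used for name-based addressing in modified_modules.
from transformers import BertForMaskedLM

backbone_model = BertForMaskedLM.from_pretrained("bert-base-uncased")
for name, module in backbone_model.named_modules():
    if name.endswith("intermediate"):  # filter by whatever suffix you care about
        print(name, type(module).__name__)
```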
+ + + + +## Now add a delta model and visualize the change. + + +```python +from opendelta import LowRankAdapterModel +delta_model = LowRankAdapterModel(backbone_model) +delta_model.freeze_module(exclude=["cls", "intermediate", "LayerNorm"]) +Visualization(backbone_model).structure_graph() +``` + +````{collapse} Click to view output +```{figure} ../imgs/bertdelta_vis.png +--- +width: 600px +name: bertdelta_vis +--- +``` +```` + +(color_schema)= +
+
**Color Schema**

- The white part is the name of the module.
- The green part is the module's type.
- The blue part is the tunable parameters, i.e., the parameters that require grad computation.
- The grey part is the frozen parameters, i.e., the parameters that do not require grad computation.
- The red part is the structure that is repeated and thus folded.
- The purple part is the delta parameters inserted into the backbone model.

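The blue/grey distinction can also be checked programmatically. A small sketch (pure PyTorch, continuing the example above where `freeze_module` has already been called on `backbone_model`):

```python
# Count tunable (blue) vs. frozen (grey) parameters of the backbone model.
tunable = sum(p.numel() for p in backbone_model.parameters() if p.requires_grad)
frozen = sum(p.numel() for p in backbone_model.parameters() if not p.requires_grad)
print(f"tunable: {tunable:,} | frozen: {frozen:,} "
      f"({100 * tunable / (tunable + frozen):.2f}% trainable)")
```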

:::{admonition} Platform Sensitivity
:class: warning
Depending on the platform the code is running on, the colors may vary slightly.
:::




## We also provide the option to visualize the nodes without parameters.

```python
Visualization(backbone_model).structure_graph(keep_non_params=True)
```

Thus, modules without parameters, such as dropout and activation layers, are also kept.


````{collapse} Click to view output
```{figure} ../imgs/bertdelta_noparam.png
---
width: 600px
name: bertdelta_noparam
---
```
````

:::{admonition} Order of the submodules
:class: warning
Currently, OpenDelta's Visualization renders the model based on PyTorch's named_modules method. That means the submodules are presented in the order they were added to the parent module, which is not necessarily the order tensors flow through.
::: \ No newline at end of file
diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..1d4da65 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,25 @@
# Use Examples

This repo mainly contains several running scripts that use OpenDelta to conduct parameter-efficient training on various tasks.

**Note that we suggest adding OpenDelta to your existing scripts, instead of modifying your scripts to match the following examples. OpenDelta itself doesn't restrict the training pipeline, nor does it provide one.**


## tutorial
Several toy tutorials:
1. The scripts for docs/basic_usage
2. Using interactive module selection
3. Work with [OpenPrompt](https://github.com/thunlp/OpenPrompt)

## examples_text-classification
Modifies a Hugging Face text-classification example into a delta tuning one.
Currently, GLUE datasets are supported in the scripts. RoBERTa-base is used for performance checking. Read the README.md inside the directory for detailed usage.

## examples_seq2seq
Modifies a Hugging Face sequence-to-sequence example into a delta tuning one.
Currently, SuperGLUE and GLUE datasets are supported in the scripts. T5-base is used for performance checking. Read the README.md inside the directory for detailed usage.


## examples_image-classification
A toy example of using OpenDelta with a computer vision pretrained model (ViT). Since ViT is an experimental feature in Hugging Face Transformers, this example is subject to change at any moment.

diff --git a/examples/examples_image-classification/README.md b/examples/examples_image-classification/README.md new file mode 100644 index 0000000..08e51bf --- /dev/null +++ b/examples/examples_image-classification/README.md @@ -0,0 +1,166 @@


# Use OpenDelta in the vision transformer (ViT)

This example uses the [huggingface image classification examples](), adding only several
lines to the original script.

## Usage
### 1. Install the necessary packages
```shell
pip install Pillow
pip install torchvision
pip install transformers==4.16.2
pip install datasets==1.18.0
```

### 2. Upgrade transformers to 4.10.0

### 3. Run
```bash
python run_image_classification.py configs/lora_beans.json
```

Do not forget to re-install datasets back to 1.17.0 for the other examples. :)


## Possible Errors
1. Dataset connection error

Solution 1: open a python console and run the failing command again; this may not always help.

Solution 2: download the dataset yourself on an internet-connected machine, save it to disk, transfer it to your server, and finally load it with load_from_disk.



# Image classification examples

The following examples showcase how to fine-tune a `ViT` for image-classification using PyTorch.
+ +## Using datasets from 🤗 `datasets` + +Here we show how to fine-tune a `ViT` on the [beans](https://huggingface.co/datasets/beans) dataset. + +👀 See the results here: [nateraw/vit-base-beans](https://huggingface.co/nateraw/vit-base-beans). + +```bash +python run_image_classification.py \ + --dataset_name beans \ + --output_dir ./beans_outputs/ \ + --remove_unused_columns False \ + --do_train \ + --do_eval \ + --push_to_hub \ + --push_to_hub_model_id vit-base-beans \ + --learning_rate 2e-5 \ + --num_train_epochs 5 \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 8 \ + --logging_strategy steps \ + --logging_steps 10 \ + --evaluation_strategy epoch \ + --save_strategy epoch \ + --load_best_model_at_end True \ + --save_total_limit 3 \ + --seed 1337 +``` + +Here we show how to fine-tune a `ViT` on the [cats_vs_dogs](https://huggingface.co/datasets/cats_vs_dogs) dataset. + +👀 See the results here: [nateraw/vit-base-cats-vs-dogs](https://huggingface.co/nateraw/vit-base-cats-vs-dogs). + +```bash +python run_image_classification.py \ + --dataset_name cats_vs_dogs \ + --output_dir ./cats_vs_dogs_outputs/ \ + --remove_unused_columns False \ + --do_train \ + --do_eval \ + --push_to_hub \ + --push_to_hub_model_id vit-base-cats-vs-dogs \ + --fp16 True \ + --learning_rate 2e-4 \ + --num_train_epochs 5 \ + --per_device_train_batch_size 32 \ + --per_device_eval_batch_size 32 \ + --logging_strategy steps \ + --logging_steps 10 \ + --evaluation_strategy epoch \ + --save_strategy epoch \ + --load_best_model_at_end True \ + --save_total_limit 3 \ + --seed 1337 +``` + +## Using your own data + +To use your own dataset, the training script expects the following directory structure: + +```bash +root/dog/xxx.png +root/dog/xxy.png +root/dog/[...]/xxz.png + +root/cat/123.png +root/cat/nsdf3.png +root/cat/[...]/asd932_.png +``` + +Once you've prepared your dataset, you can can run the script like this: + +```bash +python run_image_classification.py \ + --dataset_name nateraw/image-folder \ + --train_dir \ + --output_dir ./outputs/ \ + --remove_unused_columns False \ + --do_train \ + --do_eval +``` + +### 💡 The above will split the train dir into training and evaluation sets + - To control the split amount, use the `--train_val_split` flag. + - To provide your own validation split in its own directory, you can pass the `--validation_dir ` flag. + + +## Sharing your model on 🤗 Hub + +0. If you haven't already, [sign up](https://huggingface.co/join) for a 🤗 account + +1. Make sure you have `git-lfs` installed and git set up. + +```bash +$ apt install git-lfs +$ git config --global user.email "you@example.com" +$ git config --global user.name "Your Name" +``` + +2. Log in with your HuggingFace account credentials using `huggingface-cli` + +```bash +$ huggingface-cli login +# ...follow the prompts +``` + +3. When running the script, pass the following arguments: + +```bash +python run_image_classification.py \ + --push_to_hub \ + --push_to_hub_model_id \ + ... 
+``` \ No newline at end of file diff --git a/examples/examples_image-classification/configs/lora_beans.json b/examples/examples_image-classification/configs/lora_beans.json new file mode 100644 index 0000000..c39c522 --- /dev/null +++ b/examples/examples_image-classification/configs/lora_beans.json @@ -0,0 +1,30 @@ +{ + "report_to": "none", + "dataset_name": "beans", + "output_dir": "./beans_outputs/", + "do_train": true, + "do_eval": true, + "num_train_epochs": 5, + "remove_unused_columns": false, + "per_device_train_batch_size": 8, + "per_device_eval_batch_size": 8, + "logging_strategy": "steps", + "logging_steps": 10, + "evaluation_strategy": "epoch", + "save_strategy": "epoch", + "load_best_model_at_end": true, + "save_total_limit": 3, + "seed": 1337, + "delta_type": "lora", + "modified_modules": [ + "attention.query", + "attention.value" + ], + "unfrozen_modules": [ + "classifier", + "deltas" + ], + "overwrite_output_dir": true, + "learning_rate": 5e-4 + +} \ No newline at end of file diff --git a/examples/examples_image-classification/metric.py b/examples/examples_image-classification/metric.py new file mode 100644 index 0000000..798c111 --- /dev/null +++ b/examples/examples_image-classification/metric.py @@ -0,0 +1,89 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Accuracy metric.""" + +from sklearn.metrics import accuracy_score + +import datasets + + +_DESCRIPTION = """ +Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with: +Accuracy = (TP + TN) / (TP + TN + FP + FN) +TP: True positive +TN: True negative +FP: False positive +FN: False negative +""" + +_KWARGS_DESCRIPTION = """ +Args: + predictions: Predicted labels, as returned by a model. + references: Ground truth labels. + normalize: If False, return the number of correctly classified samples. + Otherwise, return the fraction of correctly classified samples. + sample_weight: Sample weights. +Returns: + accuracy: Accuracy score. +Examples: + + >>> accuracy_metric = datasets.load_metric("accuracy") + >>> results = accuracy_metric.compute(references=[0, 1], predictions=[0, 1]) + >>> print(results) + {'accuracy': 1.0} +""" + +_CITATION = """\ +@article{scikit-learn, + title={Scikit-learn: Machine Learning in {P}ython}, + author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. + and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. + and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and + Cournapeau, D. and Brucher, M. and Perrot, M. 
and Duchesnay, E.}, + journal={Journal of Machine Learning Research}, + volume={12}, + pages={2825--2830}, + year={2011} +} +""" + + +@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class Accuracy(datasets.Metric): + def _info(self): + return datasets.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": datasets.Sequence(datasets.Value("int32")), + "references": datasets.Sequence(datasets.Value("int32")), + } + if self.config_name == "multilabel" + else { + "predictions": datasets.Value("int32"), + "references": datasets.Value("int32"), + } + ), + reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"], + ) + + def _compute(self, predictions, references, normalize=True, sample_weight=None): + return { + "accuracy": float( + accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight) + ) + } \ No newline at end of file diff --git a/examples/examples_image-classification/requirements.txt b/examples/examples_image-classification/requirements.txt new file mode 100644 index 0000000..62bbb3c --- /dev/null +++ b/examples/examples_image-classification/requirements.txt @@ -0,0 +1,3 @@ +# torch>=1.5.0 +torchvision>=0.6.0 +datasets>=1.8.0 \ No newline at end of file diff --git a/examples/examples_image-classification/run_image_classification.py b/examples/examples_image-classification/run_image_classification.py new file mode 100644 index 0000000..cc351cf --- /dev/null +++ b/examples/examples_image-classification/run_image_classification.py @@ -0,0 +1,392 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +import logging +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +import datasets +import numpy as np +import torch +from datasets import load_dataset +from PIL import Image +from torchvision.transforms import ( + CenterCrop, + Compose, + Normalize, + RandomHorizontalFlip, + RandomResizedCrop, + Resize, + ToTensor, +) + +import transformers +from transformers import ( + MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, + AutoConfig, + AutoFeatureExtractor, + AutoModelForImageClassification, + HfArgumentParser, + Trainer, + TrainingArguments, +) +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from transformers.utils.versions import require_version + + +""" Fine-tuning a 🤗 Transformers model for image classification""" + +logger = logging.getLogger(__name__) + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
+check_min_version("4.16.0.dev0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +def pil_loader(path: str): + with open(path, "rb") as f: + im = Image.open(f) + return im.convert("RGB") + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + Using ``HfArgumentParser`` we can turn this class + into argparse arguments to be able to specify them on + the command line. + """ + + dataset_name: Optional[str] = field( + default="nateraw/image-folder", metadata={"help": "Name of a dataset from the datasets package"} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the training data."}) + validation_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the validation data."}) + train_val_split: Optional[float] = field( + default=0.15, metadata={"help": "Percent to split off of train for validation."} + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + + def __post_init__(self): + data_files = dict() + if self.train_dir is not None: + data_files["train"] = self.train_dir + if self.validation_dir is not None: + data_files["val"] = self.validation_dir + self.data_files = data_files if data_files else None + +class RemainArgHfArgumentParser(HfArgumentParser): + def parse_json_file(self, json_file: str, return_remaining_args=True ): + """ + Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the + dataclass types. + """ + import argparse + import json + from pathlib import Path + import dataclasses + + data = json.loads(Path(json_file).read_text()) + outputs = [] + for dtype in self.dataclass_types: + keys = {f.name for f in dataclasses.fields(dtype) if f.init} + inputs = {k: data.pop(k) for k in list(data.keys()) if k in keys} + obj = dtype(**inputs) + outputs.append(obj) + + remain_args = argparse.ArgumentParser() + remain_args.__dict__.update(data) + if return_remaining_args: + return (*outputs, remain_args) + else: + return (*outputs,) + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 
+ """ + + model_name_or_path: str = field( + default="google/vit-base-patch16-224-in21k", + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}, + ) + model_type: Optional[str] = field( + default=None, + metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."}) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + +def collate_fn(examples): + pixel_values = torch.stack([example["pixel_values"] for example in examples]) + labels = torch.tensor([example["labels"] for example in examples]) + return {"pixel_values": pixel_values, "labels": labels} + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args, delta_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args, delta_args = parser.parse_args_into_dataclasses() + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. 
To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Initialize our dataset and prepare it for the 'image-classification' task. + ds = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + data_files=data_args.data_files, + cache_dir=model_args.cache_dir, + task="image-classification", + ) + # If you encounter error here, try to down load the dataset by yourself and load from disk + # like the following two lines + # from datasets import load_from_disk + # ds = load_from_disk(f"../../../../huggingface_datasets/saved_to_disk/{data_args.dataset_name}") + + # If we don't have a validation split, split off a percentage of train as validation. + data_args.train_val_split = None if "validation" in ds.keys() else data_args.train_val_split + if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0: + split = ds["train"].train_test_split(data_args.train_val_split) + ds["train"] = split["train"] + ds["validation"] = split["test"] + + # Prepare label mappings. + # We'll include these in the model's config to get human readable labels in the Inference API. + labels = ds["train"].features["labels"].names + label2id, id2label = dict(), dict() + for i, label in enumerate(labels): + label2id[label] = str(i) + id2label[str(i)] = label + + # Load the accuracy metric from the datasets package + # metric = datasets.load_metric("accuracy") + metric = datasets.load_metric("metric.py") + + # Define our compute_metrics function. It takes an ``EvalPrediction`` object (a namedtuple with a + # predictions and label_ids field) and has to return a dictionary string to float. + def compute_metrics(p): + """Computes accuracy on a batch of predictions""" + return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids) + + config = AutoConfig.from_pretrained( + model_args.config_name or model_args.model_name_or_path, + num_labels=len(labels), + label2id=label2id, + id2label=id2label, + finetuning_task="image-classification", + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + model = AutoModelForImageClassification.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + feature_extractor = AutoFeatureExtractor.from_pretrained( + model_args.feature_extractor_name or model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + + if delta_args.delta_type.lower() != "none": + from opendelta import AutoDeltaConfig,AutoDeltaModel + delta_config = AutoDeltaConfig.from_dict(vars(delta_args)) + delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model) + delta_model.freeze_module(set_state_dict = True) + delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True) + + # Define torchvision transforms to be applied to each image. 
+ normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) + _train_transforms = Compose( + [ + RandomResizedCrop(feature_extractor.size), + RandomHorizontalFlip(), + ToTensor(), + normalize, + ] + ) + _val_transforms = Compose( + [ + Resize(feature_extractor.size), + CenterCrop(feature_extractor.size), + ToTensor(), + normalize, + ] + ) + + def train_transforms(example_batch): + """Apply _train_transforms across a batch.""" + example_batch["pixel_values"] = [ + _train_transforms(pil_img.convert("RGB")) for pil_img in example_batch["image"] + ] + return example_batch + + def val_transforms(example_batch): + """Apply _val_transforms across a batch.""" + example_batch["pixel_values"] = [_val_transforms(pil_img.convert("RGB")) for pil_img in example_batch["image"]] + return example_batch + + if training_args.do_train: + if "train" not in ds: + raise ValueError("--do_train requires a train dataset") + if data_args.max_train_samples is not None: + ds["train"] = ds["train"].shuffle(seed=training_args.seed).select(range(data_args.max_train_samples)) + # Set the training transforms + ds["train"].set_transform(train_transforms) + + if training_args.do_eval: + if "validation" not in ds: + raise ValueError("--do_eval requires a validation dataset") + if data_args.max_eval_samples is not None: + ds["validation"] = ( + ds["validation"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples)) + ) + # Set the validation transforms + ds["validation"].set_transform(val_transforms) + + # Initalize our trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=ds["train"] if training_args.do_train else None, + eval_dataset=ds["validation"] if training_args.do_eval else None, + compute_metrics=compute_metrics, + tokenizer=feature_extractor, + data_collator=collate_fn, + ) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() + trainer.log_metrics("train", train_result.metrics) + trainer.save_metrics("train", train_result.metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + metrics = trainer.evaluate() + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + # Write model card and (optionally) push to hub + kwargs = { + "finetuned_from": model_args.model_name_or_path, + "tasks": "image-classification", + "dataset": data_args.dataset_name, + "tags": ["image-classification"], + } + if training_args.push_to_hub: + trainer.push_to_hub(**kwargs) + else: + trainer.create_model_card(**kwargs) + + +if __name__ == "__main__": + main() diff --git a/examples/examples_seq2seq/README.md b/examples/examples_seq2seq/README.md new file mode 100644 index 0000000..38c5b22 --- /dev/null +++ b/examples/examples_seq2seq/README.md @@ -0,0 +1,64 @@ +# Appling OpenDelta to GLUE/SuperGLUE tasks using Seq2Seq Paradigm + + +## install the repo +```bash +cd ../ +python setup_seq2seq.py develop +``` +This will add `examples_seq2seq` to the environment path of the python lib. + +## Generating the json configuration file + +``` +python config_gen.py --job $job_name + +``` +The available job configuration (e.g., `--job lora_t5-base`) can be seen from `config_gen.py`. You can also +create your only configuration. 
+ + +## Run the code + +``` +python run_seq2seq.py configs/$job_name/$dataset.json +``` + +## Possible Errors + +1. +``` +ValueError: You must login to the Hugging Face hub on this computer by typing `transformers-cli login` and entering your credentials to use `use_auth_token=Tr +ue`. Alternatively, you can pass your own token as the `use_auth_token` argument. +``` +- Solution 1: Please register an account on [HuggingFace](https://huggingface.co/) +Then run transformers-cli login on your command line to enter the username and password. + +- Solution 2: Disable push_to_hub by modifying in the config.json : "push_to_hub": False + +2. +``` +OSError: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once). +``` + +- Solution 1: +``` +wget -P ~ https://github.com/git-lfs/git-lfs/releases/download/v3.0.2/git-lfs-linux-amd64-v3.0.2.tar.gz +cd ~ +tar -xvzf git-lfs-linux-amd64-v3.0.2.tar.gz +export PATH=~:$PATH +git-lfs install +``` + +- Solution 2: Disable push_to_hub by modifying in the config.json : "push_to_hub": False + + +3. dataset connection error + +Solution 1: open a python console, running the error command again, may not be useful + +Solution 2: download the dataset by yourself on a internect connected machine, saved to disk and transfer to your server, at last load_from_disk. + + +## Link to the original training scripts +This example repo is based on the [compacter training scripts](https://github.com/rabeehk/compacter), with compacter-related lines removed. Thanks to the authors of the original repo. In addition, in private correspondence with the authors, they shared the codes to create the json configs. Thanks again for their efforts. 
diff --git a/examples/examples_seq2seq/__init__.py b/examples/examples_seq2seq/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/examples_seq2seq/collect_result.jsonl b/examples/examples_seq2seq/collect_result.jsonl new file mode 100644 index 0000000..3ab2a93 --- /dev/null +++ b/examples/examples_seq2seq/collect_result.jsonl @@ -0,0 +1,21 @@ +# the final results will be populated here.{ + "evaluate": { + "epoch": 20.0, + "eval_accuracy": 89.2156862745098, + "eval_average_metrics": 90.76168929110105, + "eval_f1": 92.3076923076923, + "eval_loss": 0.16493959724903107, + "eval_runtime": 1.6391, + "eval_samples_per_second": 124.455 + }, + "repo_name": "DeltaHub/bitfit_t5-base_mrpc", + "test": { + "epoch": 20.0, + "test_accuracy": 88.23529411764706, + "test_average_metrics": 89.97971602434077, + "test_f1": 91.72413793103448, + "test_loss": 0.14968213438987732, + "test_runtime": 1.6344, + "test_samples_per_second": 124.82 + } +} diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/cola.json b/examples/examples_seq2seq/configs/bitfit_t5-base/cola.json new file mode 100644 index 0000000..8ce796a --- /dev/null +++ b/examples/examples_seq2seq/configs/bitfit_t5-base/cola.json @@ -0,0 +1,40 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "cola", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "t5-base", + "num_train_epochs": 20, + "output_dir": "outputs/bitfit/t5-base/cola", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "cola", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "cola", + "tokenizer_name": "t5-base", + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/mnli.json b/examples/examples_seq2seq/configs/bitfit_t5-base/mnli.json new file mode 100644 index 0000000..20baa39 --- /dev/null +++ b/examples/examples_seq2seq/configs/bitfit_t5-base/mnli.json @@ -0,0 +1,40 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mnli", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "t5-base", + "num_train_epochs": 3, + "output_dir": "outputs/bitfit/t5-base/mnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "mnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mnli", + "tokenizer_name": "t5-base", + "warmup_steps": 0 +} \ No newline at end of file diff --git 
a/examples/examples_seq2seq/configs/bitfit_t5-base/mrpc.json b/examples/examples_seq2seq/configs/bitfit_t5-base/mrpc.json new file mode 100644 index 0000000..62ce057 --- /dev/null +++ b/examples/examples_seq2seq/configs/bitfit_t5-base/mrpc.json @@ -0,0 +1,40 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mrpc", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "t5-base", + "num_train_epochs": 20, + "output_dir": "outputs/bitfit/t5-base/mrpc", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "mrpc", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mrpc", + "tokenizer_name": "t5-base", + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/qnli.json b/examples/examples_seq2seq/configs/bitfit_t5-base/qnli.json new file mode 100644 index 0000000..13775a2 --- /dev/null +++ b/examples/examples_seq2seq/configs/bitfit_t5-base/qnli.json @@ -0,0 +1,40 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qnli", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "t5-base", + "num_train_epochs": 3, + "output_dir": "outputs/bitfit/t5-base/qnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "qnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qnli", + "tokenizer_name": "t5-base", + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/qqp.json b/examples/examples_seq2seq/configs/bitfit_t5-base/qqp.json new file mode 100644 index 0000000..ed48b69 --- /dev/null +++ b/examples/examples_seq2seq/configs/bitfit_t5-base/qqp.json @@ -0,0 +1,40 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qqp", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "t5-base", + "num_train_epochs": 3, + "output_dir": "outputs/bitfit/t5-base/qqp", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 
42, + "split_validation_test": true, + "task_name": "qqp", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qqp", + "tokenizer_name": "t5-base", + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/rte.json b/examples/examples_seq2seq/configs/bitfit_t5-base/rte.json new file mode 100644 index 0000000..5f25109 --- /dev/null +++ b/examples/examples_seq2seq/configs/bitfit_t5-base/rte.json @@ -0,0 +1,40 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "t5-base", + "num_train_epochs": 20, + "output_dir": "outputs/bitfit/t5-base/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "t5-base", + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/sst2.json b/examples/examples_seq2seq/configs/bitfit_t5-base/sst2.json new file mode 100644 index 0000000..8319ed0 --- /dev/null +++ b/examples/examples_seq2seq/configs/bitfit_t5-base/sst2.json @@ -0,0 +1,40 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "sst2", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "t5-base", + "num_train_epochs": 3, + "output_dir": "outputs/bitfit/t5-base/sst2", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "sst2", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "sst2", + "tokenizer_name": "t5-base", + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/stsb.json b/examples/examples_seq2seq/configs/bitfit_t5-base/stsb.json new file mode 100644 index 0000000..f65a369 --- /dev/null +++ b/examples/examples_seq2seq/configs/bitfit_t5-base/stsb.json @@ -0,0 +1,40 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "stsb", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "t5-base", + "num_train_epochs": 20, + "output_dir": "outputs/bitfit/t5-base/stsb", + 
"overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "stsb", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "stsb", + "tokenizer_name": "t5-base", + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-boolq.json b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-boolq.json new file mode 100644 index 0000000..b733416 --- /dev/null +++ b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-boolq.json @@ -0,0 +1,40 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-boolq", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "t5-base", + "num_train_epochs": 20, + "output_dir": "outputs/bitfit/t5-base/superglue-boolq", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "superglue-boolq", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-boolq", + "tokenizer_name": "t5-base", + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-cb.json b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-cb.json new file mode 100644 index 0000000..a801550 --- /dev/null +++ b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-cb.json @@ -0,0 +1,40 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-cb", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "t5-base", + "num_train_epochs": 20, + "output_dir": "outputs/bitfit/t5-base/superglue-cb", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "superglue-cb", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-cb", + "tokenizer_name": "t5-base", + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-copa.json b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-copa.json new file mode 100644 index 0000000..c69b62d --- /dev/null +++ b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-copa.json @@ -0,0 +1,40 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + 
"eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-copa", + "eval_steps": 50, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "t5-base", + "num_train_epochs": 40, + "output_dir": "outputs/bitfit/t5-base/superglue-copa", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 50, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "superglue-copa", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-copa", + "tokenizer_name": "t5-base", + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-multirc.json b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-multirc.json new file mode 100644 index 0000000..fd694c2 --- /dev/null +++ b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-multirc.json @@ -0,0 +1,40 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-multirc", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "t5-base", + "num_train_epochs": 3, + "output_dir": "outputs/bitfit/t5-base/superglue-multirc", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "superglue-multirc", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-multirc", + "tokenizer_name": "t5-base", + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-record.json b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-record.json new file mode 100644 index 0000000..b9f79c5 --- /dev/null +++ b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-record.json @@ -0,0 +1,40 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-record", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 512, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "t5-base", + "num_train_epochs": 3, + "output_dir": "outputs/bitfit/t5-base/superglue-record", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "superglue-record", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-record", + "tokenizer_name": "t5-base", + 
"warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-wic.json b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-wic.json new file mode 100644 index 0000000..900067f --- /dev/null +++ b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-wic.json @@ -0,0 +1,40 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wic", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "t5-base", + "num_train_epochs": 20, + "output_dir": "outputs/bitfit/t5-base/superglue-wic", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "superglue-wic", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wic", + "tokenizer_name": "t5-base", + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-wsc.fixed.json b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-wsc.fixed.json new file mode 100644 index 0000000..d6a7b64 --- /dev/null +++ b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-wsc.fixed.json @@ -0,0 +1,40 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wsc.fixed", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "t5-base", + "num_train_epochs": 20, + "output_dir": "outputs/bitfit/t5-base/superglue-wsc.fixed", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "superglue-wsc.fixed", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wsc.fixed", + "tokenizer_name": "t5-base", + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/config_gen.py b/examples/examples_seq2seq/configs/config_gen.py new file mode 100644 index 0000000..073a112 --- /dev/null +++ b/examples/examples_seq2seq/configs/config_gen.py @@ -0,0 +1,230 @@ +import collections +import copy + +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['t5-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", 
"superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "t5-base", + "tokenizer_name": "t5-base", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['bitfit_t5-base'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/t5-base/", + }) + +AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['adapter_t5-base'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/t5-base/", + }) + +AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['lora_t5-base'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/t5-base/", + }) + +AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. 
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter++_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter++/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['low_rank_adapter_t5-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/low_rank_adapter/t5-base/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + + +AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['soft_prompt_t5-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "token_init": False, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/t5-base/", + }) + +AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['prefix_t5-base'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/t5-base/", + }) + + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"./{args.job}/"): + os.mkdir(f"./{args.job}/") + + for job_name in all_config_jsons: + with open(f"./{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + + \ No newline at end of file 
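For reference, the script above writes one JSON file per task under `./<job_name>/`. A small sketch (assuming `python config_gen.py --job bitfit_t5-base` has already been run in this directory) to check what was generated:

```python
# Quick check of the configs emitted by config_gen.py for one job.
import json
from pathlib import Path

for path in sorted(Path("./bitfit_t5-base").glob("*.json")):
    cfg = json.loads(path.read_text())
    print(path.name, cfg["task_name"], cfg["num_train_epochs"], cfg["eval_steps"])
```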
diff --git a/examples/examples_seq2seq/data_processors/__init__.py b/examples/examples_seq2seq/data_processors/__init__.py new file mode 100644 index 0000000..8b9d6bc --- /dev/null +++ b/examples/examples_seq2seq/data_processors/__init__.py @@ -0,0 +1,3 @@ +from .tasks import TASK_MAPPING, AutoTask +from .data_collator import TaskDataCollatorForSeq2Seq +from .postprocessors import AutoPostProcessor diff --git a/examples/examples_seq2seq/data_processors/data_collator.py b/examples/examples_seq2seq/data_processors/data_collator.py new file mode 100644 index 0000000..744a929 --- /dev/null +++ b/examples/examples_seq2seq/data_processors/data_collator.py @@ -0,0 +1,16 @@ +import numpy as np +from dataclasses import dataclass +from transformers import DataCollatorForSeq2Seq + + +@dataclass +class TaskDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): + def check_uniqueness(self, samples): + assert len(np.unique(samples)) == 1 + + def __call__(self, features): + # tasks = [d.pop('task') for d in features] + # self.check_uniqueness(tasks) + output = super().__call__(features) + # output["task"] = tasks[0] + return output \ No newline at end of file diff --git a/examples/examples_seq2seq/data_processors/postprocessors.py b/examples/examples_seq2seq/data_processors/postprocessors.py new file mode 100644 index 0000000..a4155b5 --- /dev/null +++ b/examples/examples_seq2seq/data_processors/postprocessors.py @@ -0,0 +1,64 @@ +import abc +from collections import OrderedDict +import numpy as np + +"""Defines functions to process the outputs to make them ready for the evaluation.""" + +def string_to_float(string, default=-1., **unused_kwargs): + """Converts string to float, using default when conversion not possible.""" + try: + return float(string) + except ValueError: + return default + + +class PostProcessor(abc.ABC): + """Postprocess the predictions and labels to make them suitable for + evaluation.""" + def __init__(self, tokenizer, ignore_pad_token_for_loss): + self.tokenizer = tokenizer + self.ignore_pad_token_for_loss = ignore_pad_token_for_loss + + def process(self, preds, labels, data_info=None): + if isinstance(preds, tuple): + preds = preds[0] + decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) + if self.ignore_pad_token_for_loss: + # Replace -100 in the labels as we can't decode them. 
+ labels = np.where(labels != -100, labels, self.tokenizer.pad_token_id) + decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) + # Some simple post-processing + decoded_preds = [pred.strip() for pred in decoded_preds] + decoded_labels = [label.strip() for label in decoded_labels] + return decoded_preds, decoded_labels + + +class MultiRC(PostProcessor): + def process(self, preds, labels, data_info): + preds, labels = super().process(preds, labels, data_info) + preds = [{"group": info["group"], "value":pred} \ + for info, pred in zip(data_info, preds)] + labels = [{"group": info["group"], "value": label}\ + for info, label in zip(data_info, labels)] + return preds, labels + +class Record(PostProcessor): + def process(self, preds, labels, data_info): + preds, labels = super().process(preds, labels, data_info) + labels = [info["answers"] for info in data_info] + return preds, labels + + +POSTPROCESSOR_MAPPING = OrderedDict( + [ + ('superglue-record', Record), + ('superglue-multirc', MultiRC) + ] +) + +class AutoPostProcessor: + @classmethod + def get(self, task, tokenizer, ignore_pad_token_for_loss): + if task in POSTPROCESSOR_MAPPING: + return POSTPROCESSOR_MAPPING[task](tokenizer, ignore_pad_token_for_loss) + return PostProcessor(tokenizer, ignore_pad_token_for_loss) diff --git a/examples/examples_seq2seq/data_processors/tasks.py b/examples/examples_seq2seq/data_processors/tasks.py new file mode 100644 index 0000000..a4f8f44 --- /dev/null +++ b/examples/examples_seq2seq/data_processors/tasks.py @@ -0,0 +1,584 @@ +from collections import OrderedDict +import collections +import abc +import functools +from typing import Callable, List, Mapping +from examples_seq2seq.trainers.trainer_utils import pad_punctuation +from examples_seq2seq.metrics import metrics +from .utils import round_stsb_target +import datasets +import logging +import numpy as np +import torch +import re + +logger = logging.getLogger(__name__) + +class AbstractTask(abc.ABC): + name = NotImplemented + config = NotImplemented + prefix = NotImplemented + preprocessor: Callable = NotImplemented + metric = NotImplemented + metric_names = NotImplemented + split_map = None + labels_list = None + split_to_data_split: Mapping[str, str] = \ + {"train": "train", "validation": "validation", "test": "test"} + small_datasets_without_all_splits = ["cola", "wnli", "rte", "superglue-cb", "superglue-copa", "superglue-multirc", + "superglue-wic", "superglue-wsc.fixed", "superglue-rte", "mrpc", "stsb", + "superglue-boolq"] + large_data_without_all_splits = ["qqp", "qnli", "superglue-record", "sst2"] + + def __init__(self, config, seed=42): + self.config = config + self.seed = seed + + def get_max_target_length(self, tokenizer, default_max_length): + if self.labels_list is not None: + return max([len(tokenizer.encode(label)) for label in self.labels_list]) + return default_max_length + + def seq2seq_format(self, sources: List[str], + targets: List[str], + add_prefix: bool=False, + prefix: str=None, + extra_fields={}): + src_prefix = self.name if prefix is None else prefix + sources = [src_prefix]+sources if add_prefix else sources + return {'source': ' '.join(sources), + 'target': ' '.join(targets), + 'task': self.name, + 'extra_fields': extra_fields} + + def check_n_obs(self, n_obs, total_size): + if n_obs is not None and n_obs > total_size: + n_obs = total_size + logger.warning("n_obs is set to %s", n_obs) + return n_obs + + def shuffled_indices(self, dataset): + num_samples = len(dataset) + generator = torch.Generator() 
+ generator.manual_seed(self.seed)
+ return torch.randperm(num_samples, generator=generator).tolist()
+
+ def subsample(self, dataset, n_obs=None, indices=None):
+ """
+ Given a dataset returns the subsampled dataset.
+ :param n_obs: the number of samples of the subsampled dataset.
+ :param indices: indices to select the samples from; if not given, indices are computed
+ by shuffling the given dataset.
+ :return: subsampled dataset.
+ """
+ num_samples = len(dataset)
+ n_obs = self.check_n_obs(n_obs, num_samples)
+ if indices is None:
+ indices = self.shuffled_indices(dataset)
+ indices = indices[:n_obs]
+ return dataset.select(indices)
+
+ def load_dataset(self, split: str):
+ return datasets.load_dataset(self.name, self.config, split=split, script_version="master")
+
+ def get_split_indices(self, split, dataset, validation_size):
+ indices = self.shuffled_indices(dataset)
+ if split == "validation":
+ return indices[:validation_size]
+ else:
+ return indices[validation_size:]
+
+ def map_dataset(self, dataset, add_prefix):
+ return dataset.map(functools.partial(self.preprocessor, add_prefix=add_prefix),
+ remove_columns=dataset.column_names)
+
+ def get(self, split, add_prefix=True, n_obs=None, split_validation_test=False):
+ # For small datasets (n_samples < 10K) without a test set, we divide the validation set in
+ # half, using one half as the test set and the other half as the validation set.
+ if split_validation_test and self.name in self.small_datasets_without_all_splits \
+ and split != "train":
+ mapped_split = self.split_to_data_split["validation"]
+ dataset = self.load_dataset(split=mapped_split)
+ indices = self.get_split_indices(split, dataset, validation_size=len(dataset)//2)
+ dataset = self.subsample(dataset, n_obs, indices)
+ # For larger datasets (n_samples > 10K), we split 1K examples off the training set as the
+ # validation set and use the rest for training, keeping the original validation
+ # set as the test set.
+ elif split_validation_test and self.name in self.large_data_without_all_splits \
+ and split != "test":
+ dataset = self.load_dataset(split="train")
+ indices = self.get_split_indices(split, dataset, validation_size=1000)
+ dataset = self.subsample(dataset, n_obs, indices)
+ else:
+ mapped_split = self.split_to_data_split[split]
+ dataset = self.load_dataset(split=mapped_split)
+ # shuffles the data and samples it.
+ if n_obs is not None: + dataset = self.subsample(dataset, n_obs) + return self.map_dataset(dataset, add_prefix) + +class Squad(AbstractTask): + name = "squad" + metric = [metrics.squad] + + def load_dataset(self, split): + return datasets.load_dataset(self.name, split=split, script_version="master") + + def preprocessor(self, example, add_prefix): + answer = pad_punctuation(example['answers']['text'][0]) + question = pad_punctuation(example['question']) + context = pad_punctuation(example['context']) + source = ["question:", question, + "context:", context] + target = [answer] + return self.seq2seq_format(source, target, add_prefix) + + +class MRPC(AbstractTask): + name = "mrpc" + labels_list = ["0", "1"] + metric = [metrics.f1_score_with_invalid, metrics.accuracy] + metric_names = ["f1", "accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'mrpc', split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["sentence1:", example['sentence1'], + "sentence2:", example["sentence2"]] + tgt_texts = [str(example['label'])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class COLA(AbstractTask): + name = "cola" + labels_list = ["0", "1"] + metric = [metrics.matthews_corrcoef] + metric_names = ["matthews_correlation"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'cola', + split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["sentence:", example['sentence']] + tgt_texts = [str(example['label'])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class SST2(AbstractTask): + name = "sst2" + labels_list = ["0", "1"] + metric = [metrics.accuracy] + metric_names = ["accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'sst2', + split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["sentence:", example['sentence']] + tgt_texts = [str(example['label'])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class STSB(AbstractTask): + name = "stsb" + labels_list = [str(np.round(label, decimals=1)) for label in np.arange(0, 5.2, 0.2)] + metric = [metrics.pearson_corrcoef, metrics.spearman_corrcoef] + metric_names = ["pearson", "spearmanr"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'stsb', + split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["sentence1:", example['sentence1'], + "sentence2:", example["sentence2"]] + tgt_texts = [str(round_stsb_target(example['label']))] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class QQP(AbstractTask): + name = "qqp" + labels_list = ["0", "1"] + metric = [metrics.f1_score_with_invalid, metrics.accuracy] + metric_names = ["f1", "accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'qqp', + split=split, script_version="master") + + def preprocessor(self, 
example, add_prefix=True): + src_texts = ["question1:", example['question1'], + "question2:", example["question2"]] + tgt_texts = [str(example['label'])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class MNLI(AbstractTask): + name = "mnli" + labels_list = ["0", "1", "2"] + split_to_data_split = {"train": "train", + "validation": "validation_mismatched", + "test": "validation_matched"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'mnli', split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["premise:", example['premise'], + "hypothesis", example["hypothesis"]] + tgt_texts = [str(example['label'])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class QNLI(AbstractTask): + name = "qnli" + labels_list = ["0", "1"] + metric = [metrics.accuracy] + metric_names = ["accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'qnli', split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["question:", example['question'], + "sentence:", example["sentence"]] + tgt_texts = [str(example['label'])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + +class RTE(AbstractTask): + name = "rte" + labels_list = ["0", "1"] + metric = [metrics.accuracy] + metric_names = ["accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'rte', + split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["sentence1:", example['sentence1'], + "sentence2:", example["sentence2"]] + tgt_texts = [str(example['label'])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class WNLI(AbstractTask): + name = "wnli" + labels_list = ["0", "1"] + metric = [metrics.accuracy] + metric_names = ["accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('glue', 'wnli', split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["sentence1:", example['sentence1'], + "sentence2:", example["sentence2"]] + tgt_texts = [str(example['label'])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class SuperGLUEBoolQ(AbstractTask): + name="superglue-boolq" + labels_list = ['0', '1'] + metric = [metrics.accuracy] + metric_names = ["accuracy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + + def load_dataset(self, split): + return datasets.load_dataset('super_glue', 'boolq', split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["question:", example["question"], "passage:", example["passage"]] + tgt_texts = [str(example["label"])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class SuperGLUERTE(AbstractTask): + name="superglue-rte" + labels_list = ['0', '1'] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + def load_dataset(self, split): + return datasets.load_dataset('super_glue', 
'rte', split=split, script_version="master")
+
+ def preprocessor(self, example, add_prefix=True):
+ src_texts = ["premise:", example["premise"],
+ "hypothesis:", example["hypothesis"]]
+ tgt_texts = [str(example["label"])]
+ return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
+
+
+class SuperGLUECB(AbstractTask):
+ name = "superglue-cb"
+ labels_list = ['0', '1', '2']
+ split_to_data_split = {"train": "train",
+ "validation": "validation",
+ "test": "validation"}
+ metric = [metrics.mean_multiclass_f1(num_classes=3), metrics.accuracy]
+ metric_names = ["f1_multiclass", "accuracy"]
+
+ def load_dataset(self, split):
+ return datasets.load_dataset('super_glue', 'cb', split=split, script_version="master")
+
+ def preprocessor(self, example, add_prefix=True):
+ src_texts = ["premise:", example["premise"], "hypothesis:", example["hypothesis"]]
+ tgt_texts = [str(example["label"])]
+ return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
+
+
+class SuperGLUECOPA(AbstractTask):
+ name = "superglue-copa"
+ labels_list = ['0', '1']
+ split_to_data_split = {"train": "train",
+ "validation": "validation",
+ "test": "validation"}
+ metric = [metrics.accuracy]
+ metric_names = ["accuracy"]
+
+ def load_dataset(self, split):
+ return datasets.load_dataset('super_glue', 'copa', split=split, script_version="master")
+
+ def preprocessor(self, example, add_prefix=True):
+ src_texts = ["premise:", example["premise"],
+ "choice1:", example["choice1"],
+ "choice2:", example["choice2"]]
+ tgt_texts = [str(example["label"])]
+ return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
+
+
+class SuperGLUEMultiRC(AbstractTask):
+ name = "superglue-multirc"
+ labels_list = ['0', '1']
+ split_to_data_split = {"train": "train",
+ "validation": "validation",
+ "test": "validation"}
+ metric = [metrics.multirc_f1_over_all_answers,
+ metrics.mean_group_metric(metrics.exact_match)]
+ metric_names = ["f1", "em"]
+
+ def load_dataset(self, split):
+ return datasets.load_dataset('super_glue', 'multirc', split=split, script_version="master")
+
+ def remove_markup(self, text):
+ """Removes the HTML markup."""
+ text = re.sub('<br>
', ' ', text) + text = re.sub('<(/)?b>', '', text) + return text + + def preprocessor(self, example, add_prefix=True): + group = example['idx']['question'] + # T5 applies remove_markup to the joined string, but this should not make + # any difference as well. + # https://github.com/google-research/text-to-text-transfer-transformer/blob/a1352e625db7ec114062f99d99b0565b9e45c155/t5/data/preprocessors.py#L797 + src_texts = ["question:", self.remove_markup(example["question"]), + "answer:", self.remove_markup(example["answer"]), + "paragraph:", self.remove_markup(example["paragraph"])] + tgt_texts = [str(example["label"])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix, extra_fields={"group": group}) + + + +class SuperGLUEWIC(AbstractTask): + name = "superglue-wic" + labels_list = ['0', '1'] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + def load_dataset(self, split): + return datasets.load_dataset('super_glue', 'wic', split=split, script_version="master") + + def preprocessor(self, example, add_prefix=True): + src_texts = ["sentence1:", example["sentence1"], + "sentence2:", example["sentence2"], + "word:", example["word"]] + tgt_texts = [str(example["label"])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class SuperGLUEWSCFixed(AbstractTask): + # source: https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/data/preprocessors.py + """Convert WSC examples to text2text format. + WSC includes a sentence along with 2 'spans': the first denoting a noun and + the other a pronoun. The 'label' specifies whether or not the pronoun is + referencing the noun. This preprocessor puts ' * ' around the noun and ' # ' + around the pronoun. + For example, a typical example from WSC might look like + { + 'text': 'This is a test sentence .', + 'span1_text': 'test', + 'span1_index': 3, + 'span2_text': 'This', + 'span2_index': 0, + 'label': 0 + } + This example would be transformed to + { + 'inputs': 'wsc text: # This # is a * test * sentence .', + 'targets': 'False' + } + """ + name = "superglue-wsc.fixed" + labels_list = ['0', '1'] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + def load_dataset(self, split): + return datasets.load_dataset('super_glue', 'wsc.fixed', split=split, script_version="master") + + def _mark_span(self, text, span_str, span_idx, mark): + pattern_tmpl = r'^((?:\S+\s){N})(W)' + pattern = re.sub('N', str(span_idx), pattern_tmpl) + pattern = re.sub('W', span_str, pattern) + return re.sub(pattern, r'\1{0} \2 {0}'.format(mark), text) + + def preprocessor(self, example, add_prefix=True): + # converts text as done in T5. + text = example['text'] + text = self._mark_span(text, example['span1_text'], example['span1_index'], '*') + # Compensate for 2 added "words" added in previous step. + span2_index = example['span2_index'] + 2 * int(example['span1_index'] < example['span2_index']) + text = self._mark_span(text, example['span2_text'], span2_index, '#') + src_texts = ["text:", text] + tgt_texts = [str(example["label"])] + return self.seq2seq_format(src_texts, tgt_texts, add_prefix) + + +class SuperGLUERecord(AbstractTask): + """Convert ReCoRD examples to text2text examples. 
+ ReCoRD contains a passage, a query containing a '@placeholder' string, and a set
+ of entities that are the possible values of the placeholder. Each train and
+ validation example will have a list of answers, any of which would be
+ considered correct.
+ For example, a typical example from ReCoRD might look like
+ {
+ 'passage': 'This is the passage.',
+ 'query': 'A @placeholder is a bird.',
+ 'entities': ['penguin', 'potato', 'pigeon'],
+ 'answers': ['penguin', 'pigeon'],
+ }
+ which this preprocessor would turn into the following two examples:
+ {
+ 'inputs': 'record query: A @placeholder is a bird. entities: penguin, '
+ 'potato, pigeon passage: This is the passage.',
+ 'targets': 'penguin',
+ }
+ and
+ {
+ 'inputs': 'record query: A @placeholder is a bird. entities: penguin, '
+ 'potato, pigeon passage: This is the passage.',
+ 'targets': 'pigeon',
+ }
+ """
+ name = "superglue-record"
+ split_to_data_split = {"train": "train",
+ "validation": "validation",
+ "test": "validation"}
+ metric = [metrics.squad]
+ metric_names = ["squad"]
+
+ def load_dataset(self, split):
+ return datasets.load_dataset('super_glue', 'record', split=split, script_version="master")
+
+ def preprocessor(self, batch, add_prefix=True):
+ new_batch = collections.defaultdict(list)
+ keys = batch.keys()
+ for values in zip(*batch.values()):
+ ex = {k: v for k, v in zip(keys, values)}
+ # updates the passage.
+ passage = ex['passage']
+ passage = re.sub(r'(\.|\?|\!|\"|\')\n@highlight\n', r'\1 ', passage)
+ passage = re.sub(r'\n@highlight\n', '. ', passage)
+ inputs = f"record query: {ex['query']} entities: {', '.join(ex['entities'])} passage: {passage}"
+ if add_prefix:
+ inputs = self.name + " " + inputs
+ # duplicates the samples based on number of answers.
+ num_answers = len(ex["answers"])
+ num_duplicates = np.maximum(1, num_answers)
+ new_batch["source"].extend([inputs] * num_duplicates)
+ new_batch["target"].extend(ex["answers"] if num_answers > 0 else [""])
+ new_batch["task"].extend([self.name] * num_duplicates)
+ new_batch["extra_fields"].extend([{"answers": ex["answers"]}]*num_duplicates)
+ return new_batch
+
+ def map_dataset(self, dataset, add_prefix=True):
+ return dataset.map(functools.partial(self.preprocessor, add_prefix=add_prefix),
+ batched=True, remove_columns=dataset.column_names)
+
+
+TASK_MAPPING = OrderedDict(
+ [
+ ('squad', Squad),
+ ('mrpc', MRPC),
+ ('cola', COLA),
+ ('sst2', SST2),
+ ('qnli', QNLI),
+ ('rte', RTE),
+ ('wnli', WNLI),
+ ('mnli', MNLI),
+ ('qqp', QQP),
+ ('stsb', STSB),
+ ('superglue-boolq', SuperGLUEBoolQ),
+ ('superglue-rte', SuperGLUERTE),
+ ('superglue-cb', SuperGLUECB),
+ ('superglue-copa', SuperGLUECOPA),
+ ('superglue-multirc', SuperGLUEMultiRC),
+ ('superglue-wic', SuperGLUEWIC),
+ ('superglue-wsc.fixed', SuperGLUEWSCFixed),
+ ('superglue-record', SuperGLUERecord)
+ ]
+)
+
+class AutoTask:
+ @classmethod
+ def get(self, task, config, seed=42):
+ if task in TASK_MAPPING:
+ return TASK_MAPPING[task](config, seed)
+ raise ValueError(
+ "Unrecognized task {} for AutoTask.\n"
+ "Task name should be one of {}.".format(
+ task, ", ".join(TASK_MAPPING.keys())
+ )
+ ) diff --git a/examples/examples_seq2seq/data_processors/utils.py b/examples/examples_seq2seq/data_processors/utils.py new file mode 100644 index 0000000..1445974 --- /dev/null +++ b/examples/examples_seq2seq/data_processors/utils.py @@ -0,0 +1,17 @@ +import numpy as np
+
+def round_stsb_target(label):
+ """STSB maps two sentences to a floating point number between 0 and 5
+ representing their
semantic similarity. Since we are treating all tasks as
+ text-to-text tasks, we need to convert this floating point number to a string.
+ The vast majority of the similarity score labels in STSB are in the set
+ [0, 0.2, 0.4, ..., 4.8, 5.0]. So, we first round the number to the closest
+ entry in this set, and then we convert the result to a string (literally e.g.
+ "3.4"). This converts STSB roughly into a 26-class classification dataset.
+ Args:
+ label: original label.
+ Returns:
+ A preprocessed label.
+ """
+ return np.round(label * 5) / 5
+ diff --git a/examples/examples_seq2seq/metrics/__init__.py b/examples/examples_seq2seq/metrics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/examples_seq2seq/metrics/metrics.py b/examples/examples_seq2seq/metrics/metrics.py new file mode 100644 index 0000000..65d4567 --- /dev/null +++ b/examples/examples_seq2seq/metrics/metrics.py @@ -0,0 +1,173 @@ +# several of the evaluation metrics are from https://github.com/google-research/text-to-text-transfer-transformer/blob/a1352e625db7ec114062f99d99b0565b9e45c155/t5/evaluation/metrics.py
+"""Defines different metrics used for evaluation of tasks."""
+import numpy as np
+import scipy.stats
+import math
+import sklearn
+import collections
+from logging import getLogger
+from .qa_utils import normalize_squad, qa_metrics
+import sklearn.metrics
+
+logger = getLogger(__name__)
+
+def accuracy(predictions, targets) -> dict:
+ """Computes the average accuracy."""
+ return {"accuracy": 100 * ((np.array(predictions) == np.array(targets)).mean())}
+
+def pearson_corrcoef(predictions, targets) -> dict:
+ """Computes Pearson correlation coefficient."""
+ from examples_seq2seq.data_processors.postprocessors import string_to_float
+ targets = [string_to_float(target) for target in targets]
+ predictions = [string_to_float(prediction) for prediction in predictions]
+ pearson_corrcoef = 100 * scipy.stats.pearsonr(targets, predictions)[0]
+
+ # Note that if all the predictions are the same, the pearson
+ # correlation is nan; to guard against this, we check the output
+ # and return 0 in this case.
+ if math.isnan(pearson_corrcoef):
+ pearson_corrcoef = 0
+ return {"pearson": pearson_corrcoef}
+
+
+def spearman_corrcoef(predictions, targets) -> dict:
+ """Computes Spearman correlation coefficient."""
+ # TODO: we need to do postprocessors in a clean way for each dataset.
+ from examples_seq2seq.data_processors.postprocessors import string_to_float
+ targets = [string_to_float(target) for target in targets]
+ predictions = [string_to_float(prediction) for prediction in predictions]
+ spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0]
+
+ # Note that if all the predictions are the same, the spearman
+ # correlation is nan; to guard against this, we check the output
+ # and return 0 in this case.
+ if math.isnan(spearman_corrcoef):
+ spearman_corrcoef = 0
+ return {"spearmanr": spearman_corrcoef}
+
+
+def f1_score_with_invalid(predictions, targets) -> dict:
+ """Computes F1 score, where any prediction != 0 or 1 is counted as incorrect.
+ Args:
+ targets: list of targets, either 0 or 1
+ predictions: list of predictions, any integer value
+ Returns:
+ F1 score, where any prediction != 0 or 1 is counted as wrong.
+ """
+ def binary_reverse(labels):
+ return ['0' if label == '1' else '1' for label in labels]
+ targets, predictions = np.asarray(targets), np.asarray(predictions)
+ # Get indices of invalid predictions.
+ invalid_idx_mask = np.logical_and(predictions != '0', predictions != '1') + # For any prediction != 0 or 1, we set the prediction to the opposite of its corresponding target. + predictions[invalid_idx_mask] = binary_reverse(targets[invalid_idx_mask]) + targets = targets.astype(np.int32) + predictions = predictions.astype(np.int32) + return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)} + +# TODO: maybe gaurd against invalid values https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow +def matthews_corrcoef(predictions, targets) -> dict: + """Computes the Matthews correlation coefficient.""" + return {"matthews_correlation": 100 * sklearn.metrics.matthews_corrcoef(targets, predictions)} + +def squad(predictions, targets): + """Computes SQuAD metrics, maximizing over answers per question. + Args: + targets: list of lists of strings + predictions: list of strings + Returns: + dict with score_key: squad score across all targets and predictions + """ + + targets = [[normalize_squad(t) for t in u] for u in targets] + predictions = [normalize_squad(p) for p in predictions] + return qa_metrics(targets, predictions) + + +def exact_match(predictions, targets): + """Computes whether the targets match predictions exactly.""" + return {"em": 100 * float(np.array_equal(targets, predictions))} + + +def sklearn_metrics_wrapper(metric_str, + metric_dict_str=None, + metric_post_process_fn=None, + **metric_fn_kwargs): + """Wraps any sklearn.metric function and returns a t5 metric function. + Args: + metric_str: string, the function from `sklearn.metrics` to use. + metric_dict_str: optional string, if not specified `metric_str` is used as + the key in the returned dictionary. + metric_post_process_fn: callable, if specified the final computed metric + will be passed through this. + **metric_fn_kwargs: kwargs, passed to the metric function we are calling. + Returns: + the function that calculates the metric in a dict. + """ + if not hasattr(sklearn.metrics, metric_str): + raise ValueError("sklearn.metrics does not have: %s" % metric_str) + + def fn(predictions, targets): + metric_fn = getattr(sklearn.metrics, metric_str) + metric_val = metric_fn(targets, predictions, **metric_fn_kwargs) + if metric_post_process_fn is not None: + metric_val = metric_post_process_fn(metric_val) + return {metric_dict_str or metric_str: metric_val} + return fn + + +def mean_multiclass_f1(num_classes, **metric_fn_kwargs): + """Computes the unweighted average of the F1 per class.""" + return sklearn_metrics_wrapper( + "fbeta_score", + metric_dict_str="f1_multiclass", + metric_post_process_fn=lambda x: 100 * x, + beta=1, + labels=range(num_classes), + average="macro", + **metric_fn_kwargs) + + +def multirc_f1_over_all_answers(targets, predictions): + """Special metric for MultiRC which computes F1 score over all examples. + This is necessary because the targets/predictions for MultiRC are dicts and + the f1_score_with_invalid expects a list of True/False labels, not dicts. As + a result we just need to key in the "value" for each of the example dicts + before feeding into f1_score_with_invalid. + Args: + targets: list of dicts, where each dict has a "value" key. + predictions: list of dicts, where each dict has a "value" key. + Returns: + F1 score over values, where any prediction != 0 or 1 is counted as wrong. 
+ """ + return f1_score_with_invalid( + [t["value"] for t in targets], [p["value"] for p in predictions] + ) + + +def mean_group_metric(metric_fn, group_key="group", value_key="value"): + """Returns a metric that averages `metric_fn` on sub-groups of results. + The sub-groups are defined by aggregating results (targets and predictions) + by accessing the feature specified by `group_key` in the target dicts. + **WARNING**: Using this function can produce unreliable results if you do not + pass in full groups. For example, if you evaluate over a random subsample of a + validation set and do not retain all of the examples in each group, you may + get results which aren't directly comparable to using the full validation set. + Args: + metric_fn: function, the metric to compute on the subgroups. + group_key: string, the key for the grouping value in the target dictionary. + value_key: string, the key for the value in the dictionaries. + """ + def my_metric(targets, predictions): + """Computes mean of `metric_fn` over subgroups of results.""" + grouped_values = collections.defaultdict(lambda: ([], [])) + for targ, pred in zip(targets, predictions): + g = targ[group_key] + grouped_values[g][0].append(targ[value_key]) + grouped_values[g][1].append(pred[value_key]) + group_scores = collections.defaultdict(list) + for (targets, predictions) in grouped_values.values(): + for metric, score in metric_fn(targets, predictions).items(): + group_scores[metric].append(score) + return {metric: np.mean(scores) for metric, scores in group_scores.items()} + return my_metric diff --git a/examples/examples_seq2seq/metrics/qa_utils.py b/examples/examples_seq2seq/metrics/qa_utils.py new file mode 100644 index 0000000..fe3fb0c --- /dev/null +++ b/examples/examples_seq2seq/metrics/qa_utils.py @@ -0,0 +1,96 @@ +# Copyright 2021 The T5 Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# source: the codes are from https://github.com/google-research/text-to-text-transfer-transformer +"""Utilities for Question Answering (QA) evaluation. +Matches results on the SQuAD (v1.1) and TriviaQA (v1.0) evaluation scripts. 
+""" + +import collections +import string +import regex as re +import numpy as np + + +def _normalize_answer(text, punc_chars, punc_repl): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(s): + return re.sub(r"\b(a|an|the)\b", " ", s) + + def replace_punctuation(s): + to_replace = set(punc_chars) + return "".join(punc_repl if ch in to_replace else ch for ch in s) + + def white_space_fix(s): + return " ".join(s.split()) + + text = text.lower() + text = replace_punctuation(text) + text = remove_articles(text) + text = white_space_fix(text) + return text + + +def normalize_trivia_qa(answer): + """Normalization used in official TriviaQA evaluation script.""" + return _normalize_answer( + answer, punc_chars=string.punctuation + "‘’´`_", punc_repl=" ").strip() + + +def normalize_squad(answer): + """Normalization used in official SQuAD evaluation script.""" + return _normalize_answer(answer, punc_chars=string.punctuation, punc_repl="") + + +def _metric_max_over_ground_truths(metric_fn, ground_truths, prediction): + """Computes the maximum of the metric over all ground truths.""" + return max( + metric_fn(ground_truth, prediction) for ground_truth in ground_truths + ) + + +def _exact_match_score(target, prediction): + return target == prediction + + +def _f1_score(target, prediction): + """Computes token f1 score for a single target and prediction.""" + prediction_tokens = prediction.split() + target_tokens = target.split() + common = (collections.Counter(prediction_tokens) & + collections.Counter(target_tokens)) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(target_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def qa_metrics(targets, predictions): + """Computes exact match and f1 QA scores, expecting pre-normalized text.""" + if len(targets) != len(predictions): + raise ValueError("Number of targets and predictions must match.") + em = np.mean([ + _metric_max_over_ground_truths(_exact_match_score, t, p) + for p, t in zip(predictions, targets) + ]) + f1 = np.mean([ + _metric_max_over_ground_truths(_f1_score, t, p) + for p, t in zip(predictions, targets) + ]) + em *= 100 + f1 *= 100 + return {"em": em, "f1": f1} diff --git a/examples/examples_seq2seq/run.sh b/examples/examples_seq2seq/run.sh new file mode 100644 index 0000000..fe2c981 --- /dev/null +++ b/examples/examples_seq2seq/run.sh @@ -0,0 +1,7 @@ +files=(cola mnli mrpc qnli qqp rte sst2 stsb superglue-boolq superglue-cb superglue-copa superglue-multirc superglue-record superglue-wic superglue-wsc.fixed) +for ((i=$1; i<=$2; i++)) +do + dataset=${files[i]} + echo "id$i:$dataset" + TOKENIZERS_PARALLELISM=false python run_seq2seq.py configs/$3/$dataset.json +done \ No newline at end of file diff --git a/examples/examples_seq2seq/run_seq2seq.py b/examples/examples_seq2seq/run_seq2seq.py new file mode 100644 index 0000000..90d6def --- /dev/null +++ b/examples/examples_seq2seq/run_seq2seq.py @@ -0,0 +1,468 @@ +# coding=utf-8 +# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for sequence to sequence. +""" +# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments. +import functools +import logging +from opendelta.utils.delta_hub import create_hub_repo_name +import torch +import os +os.environ['MKL_THREADING_LAYER'] = 'GNU' +os.environ['MKL_SERVICE_FORCE_INTEL'] = '1' +import sys +import subprocess +from typing import Optional, List + +from datasets import load_dataset, load_metric, concatenate_datasets +import transformers +from transformers import ( + AutoConfig, + AutoModelForSeq2SeqLM, + AutoTokenizer, + HfArgumentParser, + MBartTokenizer, + default_data_collator, + set_seed, +) +from transformers.trainer_utils import is_main_process, get_last_checkpoint +# from ..seq2seq.utils import get_adapter_config +from examples_seq2seq.data_processors import AutoTask, TaskDataCollatorForSeq2Seq, AutoPostProcessor +from examples_seq2seq.seq2seq_trainer import Seq2SeqTrainer +# from training_args import AdapterTrainingArguments +from examples_seq2seq.trainers.trainer_utils import save_training_config +from dataclasses import dataclass, field + +from transformers.models.t5.modeling_t5 import T5Config, T5ForConditionalGeneration +from examples_seq2seq.trainers.model_args import ModelArguments +from examples_seq2seq.trainers.trainer_args import TrainingArguments, DataTrainingArguments + +logger = logging.getLogger(__name__) + +def run_command(command): + output = subprocess.getoutput(command) + return output + + +TASK_TO_METRICS = {"mrpc": ["accuracy", "f1"], + "cola": ['matthews_correlation'], + "stsb": ['pearson', 'spearmanr'], + 'sst2': ['accuracy'], + "mnli": ["accuracy"], + "mnli_mismatched": ["accuracy"], + "mnli_matched": ["accuracy"], + "qnli": ["accuracy"], + "rte": ["accuracy"], + "wnli": ["accuracy"], + "qqp": ["accuracy", "f1"], + "superglue-boolq": ["accuracy"], + "superglue-rte": ["accuracy"], + "superglue-cb": ["f1_multiclass", "accuracy"], + "superglue-copa": ["accuracy"], + "superglue-multirc": ["f1", "em"], + "superglue-wic": ["accuracy"], + "superglue-wsc.fixed": ["accuracy"], + "superglue-record": ["f1", "em"] + } + + +class RemainArgHfArgumentParser(HfArgumentParser): + def parse_json_file(self, json_file: str, return_remaining_args=True ): + """ + Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the + dataclass types. + """ + import argparse + import json + from pathlib import Path + import dataclasses + + data = json.loads(Path(json_file).read_text()) + outputs = [] + for dtype in self.dataclass_types: + keys = {f.name for f in dataclasses.fields(dtype) if f.init} + inputs = {k: data.pop(k) for k in list(data.keys()) if k in keys} + obj = dtype(**inputs) + outputs.append(obj) + + remain_args = argparse.ArgumentParser() + remain_args.__dict__.update(data) + if return_remaining_args: + return (*outputs, remain_args) + else: + return (*outputs,) + + +def main(): + + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. 
+ # We now keep distinct sets of args, for a cleaner separation of concerns. + parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args, delta_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args, delta_args = parser.parse_args_into_dataclasses() + + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + print("#### last_checkpoint ", last_checkpoint) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + ''' + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + ''' + pass + elif last_checkpoint is not None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + # Set the verbosity to info of the Transformers logger (on main process only): + if is_main_process(training_args.local_rank): + transformers.utils.logging.set_verbosity_info() + logger.info("Training/evaluation parameters %s", training_args) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files in the summarization task, this script will use the first column for the full texts and the + # second column for the summaries (unless you specify column names for this with the `text_column` and + # `summary_column` arguments). + # For translation, only JSON files are supported, with one field named "translation" containing two keys for the + # source and target languages (unless you adapt what follows). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
+ config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + config.dropout_rate = 0.0 + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + model = AutoModelForSeq2SeqLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + model.resize_token_embeddings(len(tokenizer)) + + + if delta_args.delta_type.lower() != "none": + from opendelta import AutoDeltaConfig,AutoDeltaModel + delta_config = AutoDeltaConfig.from_dict(vars(delta_args)) + delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model) + delta_model.freeze_module(set_state_dict = True) + delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True) + + + # model parallelize + if hasattr(training_args, "model_parallel") and training_args.model_parallel: + logger.info('parallelize model!') + model.parallelize() + + data_args.dataset_name = [data_args.task_name] + data_args.eval_dataset_name = [data_args.eval_dataset_name] + data_args.test_dataset_name = [data_args.test_dataset_name] + data_args.dataset_config_name = [data_args.dataset_config_name] + data_args.eval_dataset_config_name = [data_args.eval_dataset_config_name] + data_args.test_dataset_config_name = [data_args.test_dataset_config_name] + assert len(data_args.dataset_name) == len(data_args.dataset_config_name) + if data_args.eval_dataset_name is not None: + assert len(data_args.eval_dataset_name) == len(data_args.eval_dataset_config_name) + if data_args.test_dataset_name is not None: + assert len(data_args.test_dataset_name) == len(data_args.test_dataset_config_name) + + # Temporarily set max_target_length for training. + #max_target_length = data_args.max_target_length + padding = "max_length" if data_args.pad_to_max_length else False + + def preprocess_function(examples, max_target_length): + # max_target_length += 1 + # model_inputs = tokenizer([s+"" for s in examples['source']], max_length=data_args.max_source_length, + # padding=padding, truncation=True) + # # Setup the tokenizer for targets + # with tokenizer.as_target_tokenizer(): + # labels = tokenizer([''+t for t in examples['target']], max_length=max_target_length, padding=padding, truncation=True) + model_inputs = tokenizer([s for s in examples['source']], max_length=data_args.max_source_length, + padding=padding, truncation=True) + # Setup the tokenizer for targets + with tokenizer.as_target_tokenizer(): + labels = tokenizer([t for t in examples['target']], max_length=max_target_length, padding=padding, truncation=True) + # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore + # padding in the loss. 
+ if padding == "max_length" and data_args.ignore_pad_token_for_loss: + labels["input_ids"] = [ + [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] + ] + model_inputs["labels"] = labels["input_ids"] + model_inputs["extra_fields"] = examples['extra_fields'] + return model_inputs + + column_names = ['source', 'target', 'extra_fields'] + performance_metrics = {} + if training_args.do_train: + train_datasets = [AutoTask.get(dataset_name, + dataset_config_name, + seed=data_args.data_seed).get( + split="train", + split_validation_test=training_args.split_validation_test, + add_prefix=True, + n_obs=data_args.max_train_samples) + for dataset_name, dataset_config_name\ + in zip(data_args.dataset_name, data_args.dataset_config_name)] + max_target_lengths = [AutoTask.get(dataset_name, dataset_config_name).get_max_target_length(\ + tokenizer=tokenizer, default_max_length=data_args.max_target_length)\ + for dataset_name, dataset_config_name in zip(data_args.dataset_name, data_args.dataset_config_name)] + for i, train_dataset in enumerate(train_datasets): + train_datasets[i] = train_datasets[i].map( + functools.partial(preprocess_function, max_target_length=max_target_lengths[i]), + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, # if train_dataset != "superglue-record" else column_names+["answers"], + load_from_cache_file=not data_args.overwrite_cache, + ) + train_dataset = concatenate_datasets(train_datasets) + + if training_args.do_eval: + eval_datasets = {eval_dataset: AutoTask.get(eval_dataset, eval_dataset_config, + seed=data_args.data_seed).get( + split="validation", + split_validation_test=training_args.split_validation_test, + add_prefix=True, + n_obs=data_args.max_val_samples) + for eval_dataset, eval_dataset_config in zip(data_args.eval_dataset_name, data_args.eval_dataset_config_name)} + max_target_lengths = [AutoTask.get(dataset_name, dataset_config_name).get_max_target_length( \ + tokenizer=tokenizer, default_max_length=data_args.max_target_length) \ + for dataset_name, dataset_config_name in zip(data_args.eval_dataset_name, data_args.eval_dataset_config_name)] + for k, name in enumerate(eval_datasets): + eval_datasets[name] = eval_datasets[name].map( + functools.partial(preprocess_function, max_target_length=max_target_lengths[k]), + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, # if name != "superglue-record" else column_names+["answers"], + load_from_cache_file=not data_args.overwrite_cache, + ) + + if training_args.do_test: + test_datasets = {test_dataset: AutoTask.get(test_dataset, test_dataset_config, + seed=data_args.data_seed).get( + split="test", + split_validation_test=training_args.split_validation_test, + add_prefix=True, + n_obs=data_args.max_test_samples) + for test_dataset, test_dataset_config in zip(data_args.test_dataset_name, data_args.test_dataset_config_name)} + max_target_lengths = [AutoTask.get(dataset_name, dataset_config_name).get_max_target_length( \ + tokenizer=tokenizer, default_max_length=data_args.max_target_length) \ + for dataset_name, dataset_config_name in zip(data_args.test_dataset_name, data_args.test_dataset_config_name)] + for k, name in enumerate(test_datasets): + test_datasets[name] = test_datasets[name].map( + functools.partial(preprocess_function, max_target_length=max_target_lengths[k]), + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not 
data_args.overwrite_cache, + ) + + # Data collator + label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id + if data_args.pad_to_max_length: + data_collator = default_data_collator + else: + data_collator = TaskDataCollatorForSeq2Seq( + tokenizer, + label_pad_token_id=label_pad_token_id, + pad_to_multiple_of=8 if training_args.fp16 else None, + ) + + + # Metric, we assume we have only one training task. + eval_metrics = [AutoTask.get(dataset_name, dataset_config_name).metric\ + for dataset_name, dataset_config_name in zip(data_args.dataset_name, data_args.dataset_config_name)][0] + + # Extracts the extra information needed to evaluate on each dataset. + # These information are only used in the compute_metrics. + # We will assume that the test/eval dataloader does not change the order of + # the data. + data_info = {"eval": eval_datasets[data_args.eval_dataset_name[0]]['extra_fields'], + "test": test_datasets[data_args.test_dataset_name[0]]['extra_fields'], + "train": train_dataset['extra_fields']} + def compute_metrics(eval_preds): + preds, labels, data_info = eval_preds + post_processor = AutoPostProcessor.get(data_args.dataset_name[0], tokenizer, + data_args.ignore_pad_token_for_loss) + decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info) + result = {} + for metric in eval_metrics: + result.update(metric(decoded_preds, decoded_labels)) + return result + + + # Initialize our Trainer + trainer = Seq2SeqTrainer( + model=model, + args=training_args, + delta_args=delta_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=list(eval_datasets.values())[0] if training_args.do_eval else None, + data_info = data_info, + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics if training_args.predict_with_generate else None, + evaluation_metrics = TASK_TO_METRICS[data_args.dataset_name[0]], + ) + + + # Saves training config. 
+ if trainer.is_world_process_zero(): + os.makedirs(training_args.output_dir, exist_ok=True) + save_training_config(sys.argv[1], training_args.output_dir) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + + if training_args.compute_time: + torch.cuda.synchronize() # wait for move to complete + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + + train_result = trainer.train(resume_from_checkpoint=checkpoint) + + if training_args.compute_time: + end.record() + torch.cuda.synchronize() # wait for all_reduce to complete + total_time = start.elapsed_time(end)/(1000*60) + performance_metrics.update({"total_time in minutes ": total_time}) + + trainer.save_model() # Saves the tokenizer too for easy upload + train_metrics = train_result.metrics + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + train_metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + trainer.log_metrics("train", train_metrics) + trainer.save_metrics("train", train_metrics) + trainer.save_state() + + if torch.cuda.is_available() and training_args.compute_memory: + peak_memory = (torch.cuda.max_memory_allocated() / 1024 ** 2)/1000 + print( + "Memory utilization", + peak_memory, + "GB" + ) + performance_metrics.update({"peak_memory": peak_memory}) + if training_args.compute_memory or training_args.compute_time: + print(performance_metrics) + trainer.save_metrics("performance", performance_metrics) + + # Evaluation + results = {} + if training_args.do_eval: + logger.info("*** Evaluate ***") + for task, eval_dataset in eval_datasets.items(): + metrics = trainer.evaluate(eval_dataset=eval_dataset, + max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, + ) + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + results['evaluate'] = metrics + + # Test + if training_args.do_test: + logger.info("*** Test ***") + for task, test_dataset in test_datasets.items(): + metrics = trainer.evaluate(eval_dataset=test_dataset, + max_length=data_args.test_max_target_length, num_beams=data_args.num_beams, + metric_key_prefix="test" + ) + trainer.log_metrics("test", metrics) + trainer.save_metrics("test", metrics) + results['test'] = metrics + + repo_name = create_hub_repo_name(root="DeltaHub", + dataset=data_args.task_name, + delta_type = delta_args.delta_type, + model_name_or_path= model_args.model_name_or_path) + results['repo_name'] = repo_name + if training_args.push_to_hub: # TODO add description here + delta_model.save_finetuned(push_to_hub=True, save_directory=repo_name, use_auth_token=True) + # trainer.push_to_hub(**kwargs) + else: + delta_model.save_finetuned(push_to_hub=False, save_directory=repo_name, use_auth_token=True) + + return results + + + + +if __name__ == "__main__": + result = main() + import json + with open("collect_result.jsonl", 'a') as fout: + string = json.dumps(result, indent=4,sort_keys=True) + fout.write(string+"\n") + print(result) diff --git a/examples/examples_seq2seq/seq2seq_trainer.py b/examples/examples_seq2seq/seq2seq_trainer.py new file mode 100644 index 0000000..8f31e54 --- /dev/null +++ b/examples/examples_seq2seq/seq2seq_trainer.py @@ -0,0 +1,127 @@ +from packaging import version +import torch +from torch import nn +from typing 
import Any, Dict, List, Optional, Tuple, Union + +from torch.utils.data.dataset import Dataset +from transformers import Seq2SeqTrainer as HfSeq2SeqTrainner +from examples_seq2seq.trainers.trainer import BaseTrainer + + # if is_sagemaker_mp_enabled(): +# import smdistributed.modelparallel.torch as smp + +# from transformers.trainer_utils import ShardedDDPOption + +# if is_fairscale_available(): +# dep_version_check("fairscale") +# import fairscale +# from fairscale.nn.data_parallel import FullyShardedDataParallel as FullyShardedDDP +# from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP +# from fairscale.nn.wrap import auto_wrap +# from fairscale.optim import OSS +# from fairscale.optim.grad_scaler import ShardedGradScaler + +from transformers.optimization import Adafactor, AdamW, get_scheduler +from transformers.trainer_pt_utils import get_parameter_names, is_sagemaker_mp_enabled +from transformers.integrations import is_fairscale_available + + + +if version.parse(torch.__version__) >= version.parse("1.6"): + from torch.cuda.amp import autocast + + +class Seq2SeqTrainer(HfSeq2SeqTrainner, BaseTrainer): + def __init__(self, train_dataset_sizes=None, delta_args=None, *args, **kwargs): + super().__init__(*args, **kwargs) + self.train_dataset_sizes = train_dataset_sizes + self.delta_args = delta_args + + def evaluate( + self, + eval_dataset: Optional[Dict[str, Dataset]] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + max_length: Optional[int] = None, + num_beams: Optional[int] = None, + ) -> Dict[str, float]: + # TODO: this also needs to be set per dataset + self._max_length = max_length + self._num_beams = num_beams + return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) + + + def prediction_step( + self, + model: nn.Module, + inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). 
+ """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + gen_kwargs = { + "max_length": self._max_length if self._max_length is not None else self.model.config.max_length, + "num_beams": self._num_beams if self._num_beams is not None else self.model.config.num_beams, + } + generated_tokens = self.model.generate( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + **gen_kwargs, + ) + # in case the batch is shorter than max length, the output should be padded + if generated_tokens.shape[-1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + + with torch.no_grad(): + if self.use_amp: + with autocast(): + outputs = model(**inputs) + else: + outputs = model(**inputs) + if has_labels: + if self.label_smoother is not None: + loss = self.label_smoother(outputs, inputs["labels"]).mean().detach() + else: + loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach() + else: + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) + + labels = inputs["labels"] + if labels.shape[-1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + + return (loss, generated_tokens, labels) + + + + + diff --git a/examples/examples_seq2seq/trainers/__init__.py b/examples/examples_seq2seq/trainers/__init__.py new file mode 100644 index 0000000..8a0a403 --- /dev/null +++ b/examples/examples_seq2seq/trainers/__init__.py @@ -0,0 +1,2 @@ +from .trainer import BaseTrainer +from .seq2seq_trainer import Seq2SeqTrainer diff --git a/examples/examples_seq2seq/trainers/model_args.py b/examples/examples_seq2seq/trainers/model_args.py new file mode 100644 index 0000000..35e7785 --- /dev/null +++ b/examples/examples_seq2seq/trainers/model_args.py @@ -0,0 +1,36 @@ +from dataclasses import dataclass, field +from typing import Optional, List + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." 
+ }, + ) \ No newline at end of file diff --git a/examples/examples_seq2seq/trainers/seq2seq_trainer.py b/examples/examples_seq2seq/trainers/seq2seq_trainer.py new file mode 100644 index 0000000..d6a2b80 --- /dev/null +++ b/examples/examples_seq2seq/trainers/seq2seq_trainer.py @@ -0,0 +1,108 @@ +from packaging import version +import torch +from torch import nn +from typing import Any, Dict, List, Optional, Tuple, Union + +from torch.utils.data.dataset import Dataset +from transformers import Seq2SeqTrainer +from .trainer import BaseTrainer + + +if version.parse(torch.__version__) >= version.parse("1.6"): + from torch.cuda.amp import autocast + + +class Seq2SeqTrainer(Seq2SeqTrainer, BaseTrainer): + def __init__(self, train_dataset_sizes=None, delta_args=None, *args, **kwargs): + super().__init__(*args, **kwargs) + self.train_dataset_sizes = train_dataset_sizes + self.delta_args = delta_args + + def evaluate( + self, + eval_dataset: Optional[Dict[str, Dataset]] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + max_length: Optional[int] = None, + num_beams: Optional[int] = None, + ) -> Dict[str, float]: + # TODO: this also needs to be set per dataset + self._max_length = max_length + self._num_beams = num_beams + return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) + + + def prediction_step( + self, + model: nn.Module, + inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). 
+ """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + gen_kwargs = { + "max_length": self._max_length if self._max_length is not None else self.model.config.max_length, + "num_beams": self._num_beams if self._num_beams is not None else self.model.config.num_beams, + } + generated_tokens = self.model.generate( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + **gen_kwargs, + ) + # in case the batch is shorter than max length, the output should be padded + if generated_tokens.shape[-1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + + with torch.no_grad(): + if self.use_amp: + with autocast(): + outputs = model(**inputs) + else: + outputs = model(**inputs) + if has_labels: + if self.label_smoother is not None: + loss = self.label_smoother(outputs, inputs["labels"]).mean().detach() + else: + loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach() + else: + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) + + labels = inputs["labels"] + if labels.shape[-1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + + return (loss, generated_tokens, labels) + + + + + diff --git a/examples/examples_seq2seq/trainers/trainer.py b/examples/examples_seq2seq/trainers/trainer.py new file mode 100644 index 0000000..304e32b --- /dev/null +++ b/examples/examples_seq2seq/trainers/trainer.py @@ -0,0 +1,274 @@ +from typing import Dict, List, Optional +import numpy as np +import time +import torch +import collections +from packaging import version +from torch.utils.data.dataset import Dataset + +from transformers import Trainer +from transformers import logging +from transformers.trainer_utils import ( + speed_metrics, + EvalLoopOutput, + denumpify_detensorize +) +from transformers.file_utils import is_torch_tpu_available +from transformers.trainer_pt_utils import ( + find_batch_size, + nested_numpify, + nested_truncate, + nested_concat, + IterableDatasetShard +) +from .trainer_utils import EvalPrediction + + +from torch.utils.data.dataloader import DataLoader +from torch.utils.data.dataset import IterableDataset +from transformers.deepspeed import deepspeed_init + + +if version.parse(torch.__version__) >= version.parse("1.6"): + from torch.cuda.amp import autocast + +if is_torch_tpu_available(): + import torch_xla.core.xla_model as xm + import torch_xla.debug.metrics as met + import torch_xla.distributed.parallel_loader as pl + +logger = logging.get_logger(__name__) + +class BaseTrainer(Trainer): + def __init__(self, evaluation_metrics=[], data_info=None, *args, **kwargs): + """When doing evaluation, it computes average of list of metrics + given in evaluation_metrics and adds it to the dictionary of results. 
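+        The average is logged as ``{metric_key_prefix}_average_metrics``.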
+ Trainer class then use this average metric to save the best model.""" + super().__init__(*args, **kwargs) + self.evaluation_metrics = evaluation_metrics + self.data_info = data_info + + def get_data_info(self, metric_key_prefix): + """Returns the data information required to make the predictions/labels + suitable for the evaluation.""" + if self.data_info is not None: + return self.data_info[metric_key_prefix] + return None + + def evaluate( + self, + eval_dataset: Optional[Dataset] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + ) -> Dict[str, float]: + """ + Run evaluation and returns metrics. + The calling script will be responsible for providing a method to compute metrics, as they are task-dependent + (pass it to the init :obj:`compute_metrics` argument). + You can also subclass and override this method to inject custom behavior. + Args: + eval_dataset (:obj:`Dataset`, `optional`): + Pass a dataset if you wish to override :obj:`self.eval_dataset`. If it is an :obj:`datasets.Dataset`, + columns not accepted by the ``model.forward()`` method are automatically removed. It must implement the + :obj:`__len__` method. + ignore_keys (:obj:`Lst[str]`, `optional`): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (:obj:`str`, `optional`, defaults to :obj:`"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is "eval" (default) + Returns: + A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The + dictionary also contains the epoch number which comes from the training state. + """ + # memory metrics - must set up as early as possible + self._memory_tracker.start() + eval_dataloader = self.get_eval_dataloader(eval_dataset) + start_time = time.time() + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + output = eval_loop( + eval_dataloader, + description="Evaluation", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if self.compute_metrics is None else None, + ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix, + ) + output.metrics.update(speed_metrics(metric_key_prefix, start_time, output.num_samples)) + if len(self.evaluation_metrics) != 0: + selected_metrics = [output.metrics[metric_key_prefix+"_"+k] for k in self.evaluation_metrics if metric_key_prefix+"_"+k in output.metrics] + assert len(selected_metrics) >= 1, "at least one metric should be selected to compute the average_metrics." + output.metrics.update({metric_key_prefix+'_average_metrics': np.mean(selected_metrics)}) + + self.log(output.metrics) + + if self.args.tpu_metrics_debug or self.args.debug: + # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) 
+ xm.master_print(met.metrics_report()) + + self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics) + self._memory_tracker.stop_and_update_metrics(output.metrics) + return output.metrics + + def evaluation_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + ) -> EvalLoopOutput: + """ + Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`. + + Works both with or without labels. + """ + prediction_loss_only = ( + prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only + ) + + # if eval is called w/o train init deepspeed here + if self.args.deepspeed and not self.deepspeed: + + # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval + # from the checkpoint eventually + deepspeed_engine, _, _ = deepspeed_init(self, num_training_steps=0, resume_from_checkpoint=None) + self.model = deepspeed_engine.module + self.model_wrapped = deepspeed_engine + self.deepspeed = deepspeed_engine + # XXX: we don't need optim/sched for inference, but this needs to be sorted out, since + # for example the Z3-optimizer is a must for zero3 to work even for inference - what we + # don't need is the deepspeed basic optimizer which is self.optimizer.optimizer + deepspeed_engine.optimizer.optimizer = None + deepspeed_engine.lr_scheduler = None + + model = self._wrap_model(self.model, training=False) + + # if full fp16 is wanted on eval and this ``evaluation`` or ``predict`` isn't called while + # ``train`` is running, halve it first and then put on device + if not self.is_in_train and self.args.fp16_full_eval: + model = model.half().to(self.args.device) + + batch_size = dataloader.batch_size + + logger.info(f"***** Running {description} *****") + if isinstance(dataloader.dataset, collections.abc.Sized): + logger.info(f" Num examples = {self.num_examples(dataloader)}") + else: + logger.info(" Num examples: Unknown") + logger.info(f" Batch size = {batch_size}") + + model.eval() + + self.callback_handler.eval_dataloader = dataloader + # Do this before wrapping. + eval_dataset = dataloader.dataset + + if is_torch_tpu_available(): + dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device) + + if self.args.past_index >= 0: + self._past = None + + # Initialize containers + # losses/preds/labels on GPU/TPU (accumulated for eval_accumulation_steps) + losses_host = None + preds_host = None + labels_host = None + # losses/preds/labels on CPU (final containers) + all_losses = None + all_preds = None + all_labels = None + # Will be useful when we have an iterable dataset so don't know its length. 
+ + observed_num_examples = 0 + # Main evaluation loop + for step, inputs in enumerate(dataloader): + # Update the observed num examples + observed_batch_size = find_batch_size(inputs) + if observed_batch_size is not None: + observed_num_examples += observed_batch_size + + # Prediction step + loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) + # Update containers on host + if loss is not None: + losses = self._nested_gather(loss.repeat(batch_size)) + losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) + if logits is not None: + logits = self._pad_across_processes(logits) + logits = self._nested_gather(logits) + preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) + if labels is not None: + labels = self._pad_across_processes(labels) + labels = self._nested_gather(labels) + labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + self.control = self.callback_handler.on_prediction_step(self.args, self.state, self.control) + + # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. + if self.args.eval_accumulation_steps is not None and (step + 1) % self.args.eval_accumulation_steps == 0: + if losses_host is not None: + losses = nested_numpify(losses_host) + all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) + if preds_host is not None: + logits = nested_numpify(preds_host) + all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + if labels_host is not None: + labels = nested_numpify(labels_host) + all_labels = ( + labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + ) + + # Set back to None to begin a new accumulation + losses_host, preds_host, labels_host = None, None, None + + if self.args.past_index and hasattr(self, "_past"): + # Clean the state at the end of the evaluation loop + delattr(self, "_past") + + # Gather all remaining tensors and put them back on the CPU + if losses_host is not None: + losses = nested_numpify(losses_host) + all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) + if preds_host is not None: + logits = nested_numpify(preds_host) + all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + if labels_host is not None: + labels = nested_numpify(labels_host) + all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + + # Number of samples + if not isinstance(eval_dataset, IterableDataset): + num_samples = len(eval_dataset) + elif isinstance(eval_dataset, IterableDatasetShard): + num_samples = eval_dataset.num_examples + else: + num_samples = observed_num_examples + + # Number of losses has been rounded to a multiple of batch_size and in a distributed training, the number of + # samplers has been rounded to a multiple of batch_size, so we truncate. + if all_losses is not None: + all_losses = all_losses[:num_samples] + if all_preds is not None: + all_preds = nested_truncate(all_preds, num_samples) + if all_labels is not None: + all_labels = nested_truncate(all_labels, num_samples) + # Metrics! 
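+        # Unlike the stock HuggingFace evaluation loop, compute_metrics here receives an EvalPrediction
+        # that also carries the split-specific data_info (see get_data_info above), so task-dependent
+        # post-processing of predictions/labels can happen inside the metric function.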
+ if self.compute_metrics is not None and all_preds is not None and all_labels is not None: + metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels, + data_info=self.get_data_info(metric_key_prefix))) + else: + metrics = {} + + # To be JSON-serializable, we need to remove numpy types or zero-d tensors + metrics = denumpify_detensorize(metrics) + + if all_losses is not None: + metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item() + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples) diff --git a/examples/examples_seq2seq/trainers/trainer_args.py b/examples/examples_seq2seq/trainers/trainer_args.py new file mode 100644 index 0000000..4f30e79 --- /dev/null +++ b/examples/examples_seq2seq/trainers/trainer_args.py @@ -0,0 +1,140 @@ +from dataclasses import dataclass, field +from typing import Optional, List +from transformers import Seq2SeqTrainingArguments +# run_seq2seq parameters. + +@dataclass +class TrainingArguments(Seq2SeqTrainingArguments): + print_num_parameters: Optional[bool] = field(default=False, metadata={"help": "If set, print the parameters of " + "the model."}) + do_test: Optional[bool] = field(default=False, metadata={"help": "If set, evaluates the test performance."}) + split_validation_test: Optional[bool] = field(default=False, + metadata={"help": "If set, for the datasets which do not" + "have the test set, we use validation set as their" + "test set and make a validation set from either" + "splitting the validation set into half (for smaller" + "than 10K samples datasets), or by using 1K examples" + "from training set as validation set (for larger" + " datasets)."}) + compute_time: Optional[bool] = field(default=False, metadata={"help": "If set measures the time."}) + compute_memory: Optional[bool] = field(default=False, metadata={"help": "if set, measures the memory"}) + # prefix_length: Optional[int] = field(default=100, metadata={"help": "Defines the length for prefix tuning."}) + + + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. 
+ """ + task_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + eval_dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the evaluation dataset to use (via the datasets library)."} + ) + eval_dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the evaluation dataset to use (via the datasets library)."} + ) + test_dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the test dataset to use (via the datasets library)."} + ) + test_dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the test dataset to use (via the datasets library)."} + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + max_source_length: Optional[int] = field( + default=128, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + max_target_length: Optional[int] = field( + default=128, + metadata={ + "help": "The maximum total sequence length for target text after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + val_max_target_length: Optional[int] = field( + default=None, + metadata={ + "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`." + "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " + "during ``evaluate`` and ``predict``." + }, + ) + test_max_target_length: Optional[int] = field( + default=None, + metadata={ + "help": "The maximum total sequence length for test target text after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`." + "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used " + "during ``evaluate`` and ``predict``." + }, + ) + pad_to_max_length: bool = field( + default=False, + metadata={ + "help": "Whether to pad all samples to model maximum sentence length. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " + "efficient on GPU but very bad for TPU." + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_val_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of validation examples to this " + "value if set." 
+ }, + ) + max_test_samples: Optional[int] = field( + default=None, + metadata={"help": "For debugging purposes or quicker training, truncate the number of test examples to this " + "value if set."} + ) + num_beams: Optional[int] = field(default=None, metadata={"help": "Number of beams to use for evaluation."}) + ignore_pad_token_for_loss: bool = field( + default=True, + metadata={ + "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not." + }, + ) + task_adapters: Optional[List[str]] = field( + default=None, + metadata={"help": "Defines a dictionary from task adapters to the tasks."} + ) + task_embeddings: Optional[List[str]] = field( + default=None, + metadata={"help": "Defines a dictionary from tasks to the tasks embeddings."} + ) + data_seed: Optional[int] = field(default=42, metadata={"help": "seed used to shuffle the data."}) + + model_parallel: Optional[bool] = field(default=False, metadata={"help": "whether apply model parallelization"}) + + def __post_init__(self): + if self.task_name is None: + raise ValueError("Need either a dataset name or a training/validation file.") + if self.val_max_target_length is None: + self.val_max_target_length = self.max_target_length + if self.test_max_target_length is None: + self.test_max_target_length = self.max_target_length diff --git a/examples/examples_seq2seq/trainers/trainer_utils.py b/examples/examples_seq2seq/trainers/trainer_utils.py new file mode 100644 index 0000000..3b4b917 --- /dev/null +++ b/examples/examples_seq2seq/trainers/trainer_utils.py @@ -0,0 +1,75 @@ +import numpy as np +from typing import Union, NamedTuple, Tuple, Dict, Any +import os +import regex as re +import logging +from dataclasses import fields +import torch.nn as nn +import json + + + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +class EvalPrediction(NamedTuple): + """ + Evaluation output (always contains labels), to be used to compute metrics. + Parameters: + predictions (:obj:`np.ndarray`): Predictions of the model. + label_ids (:obj:`np.ndarray`): Targets to be matched. + data_info: (:obj:`Dict[str, Any]`): Extra dataset information, one requires + to performs the evaluation. The data_info is a dictionary with keys from + train, eval, test to specify the data_info for each split of the dataset. + """ + predictions: Union[np.ndarray, Tuple[np.ndarray]] + label_ids: np.ndarray + data_info: Dict[str, Any] + + + + + +def create_dir(output_dir): + """ + Checks whether to the output_dir already exists and creates it if not. + Args: + output_dir: path to the output_dir + """ + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + +def get_last_checkpoint(output_dir): + if os.path.exists(os.path.join(output_dir, 'pytorch_model.bin')): + return output_dir + return None + + +def pad_punctuation(text): + """Re-implementation of _pad_punctuation in t5. This function adds spaces + around punctuation. While this pads punctuation as expected, it has the + unexpected effected of padding certain unicode characters with accents, with + spaces as well. For instance: "François" becomes "Fran ç ois""" + # Pad everything except for: underscores (_), whitespace (\s), + # numbers (\p{N}), letters (\p{L}) and accent characters (\p{M}). + text = re.sub(r'([^_\s\p{N}\p{L}\p{M}])', r' \1 ', text) + # Collapse consecutive whitespace into one space. 
+    text = re.sub(r'\s+', ' ', text)
+    return text
+
+def save_json(filepath, dictionary):
+    with open(filepath, "w") as outfile:
+        json.dump(dictionary, outfile)
+
+
+def read_json(filepath):
+    f = open(filepath,)
+    return json.load(f)
+
+
+def save_training_config(config_file, output_dir):
+    json_data = read_json(config_file)
+    save_json(os.path.join(output_dir, "training_config.json"), json_data)
+
diff --git a/examples/examples_seq2seq/utils/__init__.py b/examples/examples_seq2seq/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/examples/examples_seq2seq/utils/utils.py b/examples/examples_seq2seq/utils/utils.py
new file mode 100644
index 0000000..74e3528
--- /dev/null
+++ b/examples/examples_seq2seq/utils/utils.py
@@ -0,0 +1,15 @@
+import os
+import regex as re
+import logging
+from dataclasses import fields
+import torch.nn as nn
+import json
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+
+
+
+
diff --git a/examples/examples_text-classification/README.md b/examples/examples_text-classification/README.md
new file mode 100644
index 0000000..0d16da5
--- /dev/null
+++ b/examples/examples_text-classification/README.md
@@ -0,0 +1,58 @@
+# Text classification with OpenDelta
+This directory contains examples that use OpenDelta for text classification in the traditional classification mode, i.e., with a classification head on top of the language model. Almost all of the training pipeline code remains the same, except for some minimal changes to insert delta models into the backbone model.
+
+
+## Generating the json configuration file
+
+```
+python config_gen.py --job $job_name
+```
+The available job configurations (e.g., `--job lora_roberta-base`) are listed in `config_gen.py`. You can also
+create your own configuration.
+
+
+## Run the code
+
+```
+python run_glue.py configs/$job_name/$dataset.json
+```
+
+
+## Possible Errors
+
+1.
+```
+ValueError: You must login to the Hugging Face hub on this computer by typing `transformers-cli login` and entering your credentials to use `use_auth_token=True`. Alternatively, you can pass your own token as the `use_auth_token` argument.
+```
+- Solution 1: Register an account on [HuggingFace](https://huggingface.co/), then run `transformers-cli login` on your command line and enter your username and password.
+
+- Solution 2: Disable pushing to the hub by setting `"push_to_hub": false` in the config json.
+
+2.
+```
+OSError: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).
+```
+
+- Solution 1:
+```
+wget -P ~ https://github.com/git-lfs/git-lfs/releases/download/v3.0.2/git-lfs-linux-amd64-v3.0.2.tar.gz
+cd ~
+tar -xvzf git-lfs-linux-amd64-v3.0.2.tar.gz
+export PATH=~:$PATH # a temporary fix; to add it permanently, modify your shell profile (e.g., ~/.bashrc)
+git-lfs install
+```
+
+- Solution 2: Disable pushing to the hub by setting `"push_to_hub": false` in the config json.
+
+3. Dataset connection error
+
+- Solution 1: Open a python console and re-run the failing command; this may or may not help.
+
+- Solution 2: Download the dataset on an internet-connected machine, save it to disk, transfer it to your server, and finally load it with `load_from_disk`.
+
+
+## Link to the original training scripts
+This example directory is based on the [huggingface text-classification example](https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-classification). Thanks to the authors of the original repo.
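+
+## How a generated configuration is consumed (sketch)
+
+Each json produced by `config_gen.py` is a flat dictionary of training, data and delta arguments. The snippet below is a minimal sketch of how such a file can be parsed into argument dataclasses; the actual entry point is `run_glue.py`, and the exact dataclasses it uses (e.g. the delta-specific ones) may differ, so treat the class list here as an assumption.
+
+```python
+# Hedged sketch: parse a generated json config into HuggingFace argument dataclasses.
+import sys
+from transformers import HfArgumentParser, TrainingArguments
+
+# Only TrainingArguments is used in this sketch; the real script also defines model/data/delta
+# argument dataclasses. Depending on the transformers version, keys that match no dataclass
+# are either ignored or must be routed to those additional dataclasses.
+parser = HfArgumentParser((TrainingArguments,))
+(training_args,) = parser.parse_json_file(json_file=sys.argv[1])
+print(training_args.learning_rate, training_args.num_train_epochs)
+```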
diff --git a/examples/examples_text-classification/configs/config_gen.py b/examples/examples_text-classification/configs/config_gen.py new file mode 100644 index 0000000..14101ce --- /dev/null +++ b/examples/examples_text-classification/configs/config_gen.py @@ -0,0 +1,342 @@ +import collections +import copy + +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['roberta-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "metric_for_best_model"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ["eval_accuracy"] *15, + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "roberta-base", + "tokenizer_name": "roberta-base", + "save_total_limit": 1, + # For glue datasets. + # "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
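+    # The keys below turn on step-based evaluation and keep the best checkpoint according to
+    # "metric_for_best_model", which is supplied per task through the zipped tuple above.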
+ "load_best_model_at_end": True, + # "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + + +BaseConfigs['deberta-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "metric_for_best_model"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ["eval_accuracy"] *15, + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "microsoft/deberta-v3-base", + "tokenizer_name": "microsoft/deberta-v3-base", + "save_total_limit": 1, + # For glue datasets. + # "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
+ "load_best_model_at_end": True, + # "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +BaseConfigs['deberta-v2-xlarge'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "metric_for_best_model", "gradient_accumulation_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 16, 16, 16, 16, 16, 8, 16] + [16] * 8, + [ 16, 16, 16, 16, 16, 8, 16] + [16] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ["eval_accuracy"] *15, + [4] *15, + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "microsoft/deberta-v2-xlarge", + "tokenizer_name": "microsoft/deberta-v2-xlarge", + "save_total_limit": 1, + # For glue datasets. + # "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
+ "load_best_model_at_end": True, + # "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + + +AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['bitfit_roberta-base'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/roberta-base/", + "unfrozen_modules": [ + "classifier", + "deltas" + ], + }) + +AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['adapter_roberta-base'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier", + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/roberta-base/", + }) + +AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['lora_roberta-base'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier", + ], + "lora_r": 8, + "output_dir": "outputs/lora/roberta-base/", + }) + +AllConfigs['compacter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['compacter_roberta-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier", + ], + "output_dir": "outputs/compacter/roberta-base/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['compacter++_roberta-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier", + ], + "output_dir": "outputs/compacter++/roberta-base/", + "non_linearity": "gelu_new", + + #Compacter. 
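+    # Rough meaning of the PHM hyperparameters below (following the Compacter paper):
+    # "hypercomplex_division" is the number of Kronecker summands n; "factorized_phm" low-rank
+    # factorizes the per-layer matrices; "shared_phm_rule"/"factorized_phm_rule" control whether
+    # the small "rule" matrices are shared across layers and themselves factorized.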
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['low_rank_adapter_roberta-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier", + ], + "output_dir": "outputs/low_rank_adapter/roberta-base/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + + +AllConfigs['soft_prompt_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['soft_prompt_roberta-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "unfrozen_modules": [ + "deltas", + "classifier", + ], + "output_dir": "outputs/soft_prompt/roberta-base/", + }) + +AllConfigs['prefix_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['prefix_roberta-base'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "classifier", + ], + "output_dir": "outputs/prefix/roberta-base/", + }) + +AllConfigs['soft_prompt_deberta-v2-xlarge'] = copy.deepcopy(BaseConfigs['deberta-v2-xlarge']) +AllConfigs['soft_prompt_deberta-v2-xlarge'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "unfrozen_modules": [ + "deltas", + "classifier", + ], + "output_dir": "outputs/soft_prompt/deberta-v2-xlarge/", + }) + + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"./{args.job}/"): + os.mkdir(f"./{args.job}/") + + for job_name in all_config_jsons: + with open(f"./{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + + \ No newline at end of file diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/cola.json b/examples/examples_text-classification/configs/prefix_roberta-base/cola.json new file mode 100644 index 0000000..eafe735 --- /dev/null +++ b/examples/examples_text-classification/configs/prefix_roberta-base/cola.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + 
], + "eval_dataset_name": "cola", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/roberta-base/cola", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "cola", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "cola", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/mnli.json b/examples/examples_text-classification/configs/prefix_roberta-base/mnli.json new file mode 100644 index 0000000..d134b54 --- /dev/null +++ b/examples/examples_text-classification/configs/prefix_roberta-base/mnli.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mnli", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/prefix/roberta-base/mnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "mnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mnli", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/mrpc.json b/examples/examples_text-classification/configs/prefix_roberta-base/mrpc.json new file mode 100644 index 0000000..8a96090 --- /dev/null +++ b/examples/examples_text-classification/configs/prefix_roberta-base/mrpc.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mrpc", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/roberta-base/mrpc", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "mrpc", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mrpc", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of 
file diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/qnli.json b/examples/examples_text-classification/configs/prefix_roberta-base/qnli.json new file mode 100644 index 0000000..3e5142e --- /dev/null +++ b/examples/examples_text-classification/configs/prefix_roberta-base/qnli.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qnli", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/prefix/roberta-base/qnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "qnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qnli", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/qqp.json b/examples/examples_text-classification/configs/prefix_roberta-base/qqp.json new file mode 100644 index 0000000..d36f69a --- /dev/null +++ b/examples/examples_text-classification/configs/prefix_roberta-base/qqp.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qqp", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/prefix/roberta-base/qqp", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "qqp", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qqp", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/rte.json b/examples/examples_text-classification/configs/prefix_roberta-base/rte.json new file mode 100644 index 0000000..d6c8470 --- /dev/null +++ b/examples/examples_text-classification/configs/prefix_roberta-base/rte.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/roberta-base/rte", + 
"overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/sst2.json b/examples/examples_text-classification/configs/prefix_roberta-base/sst2.json new file mode 100644 index 0000000..a583cce --- /dev/null +++ b/examples/examples_text-classification/configs/prefix_roberta-base/sst2.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "sst2", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/prefix/roberta-base/sst2", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "sst2", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "sst2", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/stsb.json b/examples/examples_text-classification/configs/prefix_roberta-base/stsb.json new file mode 100644 index 0000000..63dd100 --- /dev/null +++ b/examples/examples_text-classification/configs/prefix_roberta-base/stsb.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "stsb", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/roberta-base/stsb", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "stsb", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "stsb", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-boolq.json b/examples/examples_text-classification/configs/prefix_roberta-base/superglue-boolq.json new file mode 100644 index 0000000..013892a --- /dev/null +++ b/examples/examples_text-classification/configs/prefix_roberta-base/superglue-boolq.json @@ -0,0 +1,43 @@ +{ + 
"dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-boolq", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/roberta-base/superglue-boolq", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-boolq", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-boolq", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-cb.json b/examples/examples_text-classification/configs/prefix_roberta-base/superglue-cb.json new file mode 100644 index 0000000..4513356 --- /dev/null +++ b/examples/examples_text-classification/configs/prefix_roberta-base/superglue-cb.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-cb", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/roberta-base/superglue-cb", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-cb", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-cb", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-copa.json b/examples/examples_text-classification/configs/prefix_roberta-base/superglue-copa.json new file mode 100644 index 0000000..59d4f70 --- /dev/null +++ b/examples/examples_text-classification/configs/prefix_roberta-base/superglue-copa.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-copa", + "eval_steps": 50, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 40, + "output_dir": "outputs/prefix/roberta-base/superglue-copa", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + 
"save_steps": 50, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-copa", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-copa", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-multirc.json b/examples/examples_text-classification/configs/prefix_roberta-base/superglue-multirc.json new file mode 100644 index 0000000..3ea0c77 --- /dev/null +++ b/examples/examples_text-classification/configs/prefix_roberta-base/superglue-multirc.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-multirc", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/prefix/roberta-base/superglue-multirc", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-multirc", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-multirc", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-record.json b/examples/examples_text-classification/configs/prefix_roberta-base/superglue-record.json new file mode 100644 index 0000000..1251019 --- /dev/null +++ b/examples/examples_text-classification/configs/prefix_roberta-base/superglue-record.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-record", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 512, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/prefix/roberta-base/superglue-record", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-record", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-record", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-wic.json b/examples/examples_text-classification/configs/prefix_roberta-base/superglue-wic.json new file mode 100644 index 0000000..e5d9f12 --- /dev/null +++ b/examples/examples_text-classification/configs/prefix_roberta-base/superglue-wic.json 
@@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wic", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/roberta-base/superglue-wic", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-wic", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wic", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-wsc.fixed.json b/examples/examples_text-classification/configs/prefix_roberta-base/superglue-wsc.fixed.json new file mode 100644 index 0000000..ee224df --- /dev/null +++ b/examples/examples_text-classification/configs/prefix_roberta-base/superglue-wsc.fixed.json @@ -0,0 +1,43 @@ +{ + "dataset_config_name": [ + "en" + ], + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wsc.fixed", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/roberta-base/superglue-wsc.fixed", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-wsc.fixed", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wsc.fixed", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/metrics/glue.py b/examples/examples_text-classification/metrics/glue.py new file mode 100644 index 0000000..ffd0fc1 --- /dev/null +++ b/examples/examples_text-classification/metrics/glue.py @@ -0,0 +1,156 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" GLUE benchmark metric. 
""" + +from scipy.stats import pearsonr, spearmanr +from sklearn.metrics import f1_score, matthews_corrcoef + +import datasets + + +_CITATION = """\ +@inproceedings{wang2019glue, + title={{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding}, + author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.}, + note={In the Proceedings of ICLR.}, + year={2019} +} +""" + +_DESCRIPTION = """\ +GLUE, the General Language Understanding Evaluation benchmark +(https://gluebenchmark.com/) is a collection of resources for training, +evaluating, and analyzing natural language understanding systems. +""" + +_KWARGS_DESCRIPTION = """ +Compute GLUE evaluation metric associated to each GLUE dataset. +Args: + predictions: list of predictions to score. + Each translation should be tokenized into a list of tokens. + references: list of lists of references for each translation. + Each reference should be tokenized into a list of tokens. +Returns: depending on the GLUE subset, one or several of: + "accuracy": Accuracy + "f1": F1 score + "pearson": Pearson Correlation + "spearmanr": Spearman Correlation + "matthews_correlation": Matthew Correlation +Examples: + + >>> glue_metric = datasets.load_metric('glue', 'sst2') # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"] + >>> references = [0, 1] + >>> predictions = [0, 1] + >>> results = glue_metric.compute(predictions=predictions, references=references) + >>> print(results) + {'accuracy': 1.0} + + >>> glue_metric = datasets.load_metric('glue', 'mrpc') # 'mrpc' or 'qqp' + >>> references = [0, 1] + >>> predictions = [0, 1] + >>> results = glue_metric.compute(predictions=predictions, references=references) + >>> print(results) + {'accuracy': 1.0, 'f1': 1.0} + + >>> glue_metric = datasets.load_metric('glue', 'stsb') + >>> references = [0., 1., 2., 3., 4., 5.] + >>> predictions = [0., 1., 2., 3., 4., 5.] 
+ >>> results = glue_metric.compute(predictions=predictions, references=references) + >>> print({"pearson": round(results["pearson"], 2), "spearmanr": round(results["spearmanr"], 2)}) + {'pearson': 1.0, 'spearmanr': 1.0} + + >>> glue_metric = datasets.load_metric('glue', 'cola') + >>> references = [0, 1] + >>> predictions = [0, 1] + >>> results = glue_metric.compute(predictions=predictions, references=references) + >>> print(results) + {'matthews_correlation': 1.0} +""" + + +def simple_accuracy(preds, labels): + return float((preds == labels).mean()) + + +def acc_and_f1(preds, labels): + acc = simple_accuracy(preds, labels) + f1 = float(f1_score(y_true=labels, y_pred=preds)) + return { + "accuracy": acc, + "f1": f1, + } + + +def pearson_and_spearman(preds, labels): + pearson_corr = float(pearsonr(preds, labels)[0]) + spearman_corr = float(spearmanr(preds, labels)[0]) + return { + "pearson": pearson_corr, + "spearmanr": spearman_corr, + } + + +@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class Glue(datasets.Metric): + def _info(self): + if self.config_name not in [ + "sst2", + "mnli", + "mnli_mismatched", + "mnli_matched", + "cola", + "stsb", + "mrpc", + "qqp", + "qnli", + "rte", + "wnli", + "hans", + ]: + raise KeyError( + "You should supply a configuration name selected in " + '["sst2", "mnli", "mnli_mismatched", "mnli_matched", ' + '"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]' + ) + return datasets.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": datasets.Value("int64" if self.config_name != "stsb" else "float32"), + "references": datasets.Value("int64" if self.config_name != "stsb" else "float32"), + } + ), + codebase_urls=[], + reference_urls=[], + format="numpy", + ) + + def _compute(self, predictions, references): + if self.config_name == "cola": + return {"matthews_correlation": matthews_corrcoef(references, predictions)} + elif self.config_name == "stsb": + return pearson_and_spearman(predictions, references) + elif self.config_name in ["mrpc", "qqp"]: + return acc_and_f1(predictions, references) + elif self.config_name in ["sst2", "mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]: + return {"accuracy": simple_accuracy(predictions, references)} + else: + raise KeyError( + "You should supply a configuration name selected in " + '["sst2", "mnli", "mnli_mismatched", "mnli_matched", ' + '"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]' + ) \ No newline at end of file diff --git a/examples/examples_text-classification/requirements.txt b/examples/examples_text-classification/requirements.txt new file mode 100644 index 0000000..8d8ff7a --- /dev/null +++ b/examples/examples_text-classification/requirements.txt @@ -0,0 +1,8 @@ +accelerate +datasets >= 1.8.0 +sentencepiece != 0.1.92 +scipy +scikit-learn +protobuf +torch >= 1.3 +argunparse \ No newline at end of file diff --git a/examples/examples_text-classification/run.sh b/examples/examples_text-classification/run.sh new file mode 100644 index 0000000..e7363d6 --- /dev/null +++ b/examples/examples_text-classification/run.sh @@ -0,0 +1,7 @@ +files=(cola mnli mrpc qnli qqp rte sst2 stsb superglue-boolq superglue-cb superglue-copa superglue-multirc superglue-record superglue-wic superglue-wsc.fixed) +for ((i=$1; i<=$2; i++)) +do + dataset=${files[i]} + echo "id$i:$dataset" + TOKENIZERS_PARALLELISM=false python run_glue.py 
configs/$3/$dataset.json +done \ No newline at end of file diff --git a/examples/examples_text-classification/run_glue.py b/examples/examples_text-classification/run_glue.py new file mode 100755 index 0000000..9ca0477 --- /dev/null +++ b/examples/examples_text-classification/run_glue.py @@ -0,0 +1,632 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Finetuning the library models for sequence classification on GLUE.""" +# You can also adapt this script on your own text classification task. Pointers for this are left as comments. + +import argparse +import dataclasses +import json +import logging +import os +from pathlib import Path +import random +import re +import sys +from dataclasses import dataclass, field +from typing import Optional + +import datasets +import numpy as np +from datasets import load_dataset, load_metric +from opendelta.utils.delta_hub import create_hub_repo_name + +import transformers +from transformers import ( + AutoConfig, + AutoModelForSequenceClassification, + AutoTokenizer, + DataCollatorWithPadding, + EvalPrediction, + HfArgumentParser, + PretrainedConfig, + # Trainer, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.trainer import Trainer + +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +# check_min_version("4.16.0.dev0") + +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") + +task_to_keys = { + "cola": ("sentence", None), + "mnli": ("premise", "hypothesis"), + "mrpc": ("sentence1", "sentence2"), + "qnli": ("question", "sentence"), + "qqp": ("question1", "question2"), + "rte": ("sentence1", "sentence2"), + "sst2": ("sentence", None), + "stsb": ("sentence1", "sentence2"), + "wnli": ("sentence1", "sentence2"), +} + +logger = logging.getLogger(__name__) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + + Using `HfArgumentParser` we can turn this class + into argparse arguments to be able to specify them on + the command line. + """ + + task_name: Optional[str] = field( + default=None, + metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())}, + ) + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + max_seq_length: int = field( + default=128, + metadata={ + "help": "The maximum total input sequence length after tokenization. 
Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} + ) + pad_to_max_length: bool = field( + default=True, + metadata={ + "help": "Whether to pad all samples to `max_seq_length`. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch." + }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + max_predict_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " + "value if set." + }, + ) + train_file: Optional[str] = field( + default=None, metadata={"help": "A csv or a json file containing the training data."} + ) + validation_file: Optional[str] = field( + default=None, metadata={"help": "A csv or a json file containing the validation data."} + ) + test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."}) + + def __post_init__(self): + if self.task_name is not None: + self.task_name = self.task_name.lower() + if self.task_name not in task_to_keys.keys(): + raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys())) + elif self.dataset_name is not None: + pass + elif self.train_file is None or self.validation_file is None: + raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.") + else: + train_extension = self.train_file.split(".")[-1] + assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file." + validation_extension = self.validation_file.split(".")[-1] + assert ( + validation_extension == train_extension + ), "`validation_file` should have the same extension (csv or json) as `train_file`." + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." 
+ }, + ) + + +class RemainArgHfArgumentParser(HfArgumentParser): + def parse_json_file(self, json_file: str, return_remaining_args=True ): + """ + Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the + dataclass types. + """ + data = json.loads(Path(json_file).read_text()) + outputs = [] + for dtype in self.dataclass_types: + keys = {f.name for f in dataclasses.fields(dtype) if f.init} + inputs = {k: data.pop(k) for k in list(data.keys()) if k in keys} + obj = dtype(**inputs) + outputs.append(obj) + + remain_args = argparse.ArgumentParser() + remain_args.__dict__.update(data) + if return_remaining_args: + return (*outputs, remain_args) + else: + return (*outputs,) + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + json_file=os.path.abspath(sys.argv[1]) + model_args, data_args, training_args, delta_args = parser.parse_json_file(json_file, return_remaining_args=True) #args = arg_string, return_remaining_strings=True) #parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args, delta_args = parser.parse_args_into_dataclasses() + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) + # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). 
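+    # As a rough illustration (a hypothetical invocation, using one of the prefix-tuning configs added in this
+    # commit), the script can be launched with a single JSON config file, mirroring run.sh:
+    #   python run_glue.py configs/prefix_roberta-base/sst2.json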
+ # + # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the + # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named + # label if at least two columns are provided. + # + # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this + # single column. You can easily tweak this behavior (see below) + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.task_name is not None: + # Downloading and loading a dataset from the hub. + + raw_datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir) + # if you encounter error here + # download the dataset, save to disk and then load_from_disk + # from datasets import load_from_disk + # raw_datasets = load_from_disk(f"../../../../huggingface_datasets/saved_to_disk/glue.{data_args.task_name}") + + elif data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + ) + else: + # Loading a dataset from your local files. + # CSV/JSON training and evaluation files are needed. + data_files = {"train": data_args.train_file, "validation": data_args.validation_file} + + # Get the test dataset: you can provide your own CSV/JSON test file (see below) + # when you use `do_predict` without specifying a GLUE benchmark task. + if training_args.do_predict: + if data_args.test_file is not None: + train_extension = data_args.train_file.split(".")[-1] + test_extension = data_args.test_file.split(".")[-1] + assert ( + test_extension == train_extension + ), "`test_file` should have the same extension (csv or json) as `train_file`." + data_files["test"] = data_args.test_file + else: + raise ValueError("Need either a GLUE task or a test file for `do_predict`.") + + for key in data_files.keys(): + logger.info(f"load a local file for {key}: {data_files[key]}") + + if data_args.train_file.endswith(".csv"): + # Loading a dataset from local csv files + raw_datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) + else: + # Loading a dataset from local json files + raw_datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) + # See more about loading any type of standard or custom dataset at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Labels + if data_args.task_name is not None: + is_regression = data_args.task_name == "stsb" + if not is_regression: + label_list = raw_datasets["train"].features["label"].names + num_labels = len(label_list) + else: + num_labels = 1 + else: + # Trying to have good defaults here, don't hesitate to tweak to your needs. + is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"] + if is_regression: + num_labels = 1 + else: + # A useful fast method: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique + label_list = raw_datasets["train"].unique("label") + label_list.sort() # Let's sort it for determinism + num_labels = len(label_list) + + # Load pretrained model and tokenizer + # + # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
+ config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + num_labels=num_labels, + finetuning_task=data_args.task_name, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + model = AutoModelForSequenceClassification.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + if delta_args.delta_type.lower() != "none": + from opendelta import AutoDeltaConfig + from opendelta.auto_delta import AutoDeltaModel + delta_config = AutoDeltaConfig.from_dict(vars(delta_args)) + delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model) + delta_model.freeze_module(set_state_dict = True) + delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True) + + + + + + + + + + + # Preprocessing the raw_datasets + if data_args.task_name is not None: + sentence1_key, sentence2_key = task_to_keys[data_args.task_name] + else: + # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. + non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"] + if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: + sentence1_key, sentence2_key = "sentence1", "sentence2" + else: + if len(non_label_column_names) >= 2: + sentence1_key, sentence2_key = non_label_column_names[:2] + else: + sentence1_key, sentence2_key = non_label_column_names[0], None + + # Padding strategy + if data_args.pad_to_max_length: + padding = "max_length" + else: + # We will pad later, dynamically at batch creation, to the max sequence length in each batch + padding = False + + # Some models have set the order of the labels to use, so let's make sure we do use it. + label_to_id = None + if ( + model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id + and data_args.task_name is not None + and not is_regression + ): + # Some have all caps in their config, some don't. + label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} + if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): + label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} + else: + logger.warning( + "Your model seems to have been trained with labels, but they don't match the dataset: ", + f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." 
+ "\nIgnoring the model labels as a result.", + ) + elif data_args.task_name is None and not is_regression: + label_to_id = {v: i for i, v in enumerate(label_list)} + + if label_to_id is not None: + model.config.label2id = label_to_id + model.config.id2label = {id: label for label, id in config.label2id.items()} + elif data_args.task_name is not None and not is_regression: + model.config.label2id = {l: i for i, l in enumerate(label_list)} + model.config.id2label = {id: label for label, id in config.label2id.items()} + + if data_args.max_seq_length > tokenizer.model_max_length: + logger.warning( + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." + ) + max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) + + def preprocess_function(examples): + # Tokenize the texts + args = ( + (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) + ) + result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) + + # Map labels to IDs (not necessary for GLUE tasks) + if label_to_id is not None and "label" in examples: + result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] + return result + + with training_args.main_process_first(desc="dataset map pre-processing"): + raw_datasets = raw_datasets.map( + preprocess_function, + batched=True, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = raw_datasets["train"] + if data_args.max_train_samples is not None: + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + + if training_args.do_eval: + if "validation" not in raw_datasets and "validation_matched" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] + if data_args.max_eval_samples is not None: + eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) + + if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None: + if "test" not in raw_datasets and "test_matched" not in raw_datasets: + raise ValueError("--do_predict requires a test dataset") + predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"] + if data_args.max_predict_samples is not None: + predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) + + # Log a few random samples from the training set: + if training_args.do_train: + for index in random.sample(range(len(train_dataset)), 3): + logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + + # Get the metric function + if data_args.task_name is not None: + # metric = load_metric("glue", data_args.task_name) + metric = load_metric("./metrics/glue.py", data_args.task_name) + else: + metric = load_metric("accuracy") + + # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a + # predictions and label_ids field) and has to return a dictionary string to float. 
+ def compute_metrics(p: EvalPrediction): + preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions + preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) + if data_args.task_name is not None: + result = metric.compute(predictions=preds, references=p.label_ids) + if len(result) > 1: + result["combined_score"] = np.mean(list(result.values())).item() + return result + elif is_regression: + return {"mse": ((preds - p.label_ids) ** 2).mean().item()} + else: + return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} + + # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. + if data_args.pad_to_max_length: + data_collator = default_data_collator + elif training_args.fp16: + data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) + else: + data_collator = None + + # Initialize our Trainer + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + compute_metrics=compute_metrics, + tokenizer=tokenizer, + data_collator=data_collator, + ) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.save_model() # Saves the tokenizer too for easy upload + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + results = {} + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + # Loop to handle MNLI double evaluation (matched, mis-matched) + tasks = [data_args.task_name] + eval_datasets = [eval_dataset] + if data_args.task_name == "mnli": + tasks.append("mnli-mm") + eval_datasets.append(raw_datasets["validation_mismatched"]) + + for eval_dataset, task in zip(eval_datasets, tasks): + metrics = trainer.evaluate(eval_dataset=eval_dataset) + + max_eval_samples = ( + data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + ) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + results['eval'] = metrics + + if training_args.do_predict: + logger.info("*** Predict ***") + + # Loop to handle MNLI double evaluation (matched, mis-matched) + tasks = [data_args.task_name] + predict_datasets = [predict_dataset] + if data_args.task_name == "mnli": + tasks.append("mnli-mm") + predict_datasets.append(raw_datasets["test_mismatched"]) + + for predict_dataset, task in zip(predict_datasets, tasks): + # Removing the `label` columns because it contains -1 and Trainer won't like that. 
+ predict_dataset = predict_dataset.remove_columns("label") + predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions + predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1) + + output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt") + if trainer.is_world_process_zero(): + with open(output_predict_file, "w") as writer: + logger.info(f"***** Predict results {task} *****") + writer.write("index\tprediction\n") + for index, item in enumerate(predictions): + if is_regression: + writer.write(f"{index}\t{item:3.3f}\n") + else: + item = label_list[item] + writer.write(f"{index}\t{item}\n") + + + # kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"} + # if data_args.task_name is not None: + # kwargs["language"] = "en" + # kwargs["dataset_tags"] = "glue" + # kwargs["dataset_args"] = data_args.task_name + # kwargs["dataset"] = f"GLUE {data_args.task_name.upper()}" + # kwargs["delta_type"] = delta_args.delta_type + + repo_name = create_hub_repo_name(root="DeltaHub", + dataset=data_args.task_name, + delta_type = delta_args.delta_type, + model_name_or_path= model_args.model_name_or_path) + + if training_args.push_to_hub: # TODO add description here + delta_model.save_finetuned(push_to_hub=True, save_directory=repo_name, use_auth_token=True) + # trainer.push_to_hub(**kwargs) + else: + delta_model.save_finetuned(push_to_hub=False, save_directory=repo_name, use_auth_token=True) + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/examples/examples_text-classification/util.py b/examples/examples_text-classification/util.py new file mode 100644 index 0000000..50393e0 --- /dev/null +++ b/examples/examples_text-classification/util.py @@ -0,0 +1,75 @@ +from datasets import load_dataset, load_metric +import torch +import logging + + +logger = logging.getLogger(__name__) + + +class DataLoader: + small_datasets_without_all_splits = ["cola", "wnli", "rte", "superglue-cb", "superglue-copa", "superglue-multirc", + "superglue-wic", "superglue-wsc.fixed", "superglue-rte", "mrpc", "stsb", + "superglue-boolq"] + large_data_without_all_splits = ["qqp", "qnli", "superglue-record", "sst2"] + + def __init__(self, raw_datasets, data_args, model_args, training_args): + self.raw_datasets = raw_datasets + self.data_args = data_args + self.model_args = model_args + self.training_args = training_args + + def shuffled_indices(self, dataset): + num_samples = len(dataset) + generator = torch.Generator() + generator.manual_seed(self.training_args.seed) + return torch.randperm(num_samples, generator=generator).tolist() + + def subsample(self, dataset, indices=None): + """ + Given a dataset returns the subsampled dataset. + :param n_obs: the number of samples of the subsampled dataset. + :param indices: indices to select the samples from, if not given, indices are computed + from by shuffling the given dataset. + :return: subsampled dataset. 
+ """ + if indices is None: + indices = self.shuffled_indices(dataset) + return dataset.select(indices) + + def get_split_indices(self, split, dataset, validation_size): + indices = self.shuffled_indices(dataset) + if split == "validation": + return indices[:validation_size] + else: + return indices[validation_size:] + + def get(self, split): + if self.data_args.task_name == 'mnli': + if split == 'validation': + split = 'validation_mismatched' + elif split == 'test': + split = 'validation_matched' + return self.raw_datasets[split] + # For small datasets (n_samples < 10K) without test set, we divide validation set to + # half, use one half as test set and one half as validation set. + if self.data_args.task_name in self.small_datasets_without_all_splits \ + and split != "train": + logger.info("Split validation set into test and validation set.") + dataset = self.raw_datasets['validation'] + indices = self.get_split_indices(split, dataset, validation_size=len(dataset)//2) + dataset = self.subsample(dataset, indices) + # For larger datasets (n_samples > 10K), we divide training set into 1K as + # validation and the rest as training set, keeping the original validation + # set as the test set. + elif self.data_args.task_name in self.large_data_without_all_splits \ + and split != "test": + logger.info("Split training set into train and validation set, use validation set as test set.") + dataset = self.raw_datasets['train'] + indices = self.get_split_indices(split, dataset, validation_size=1000) + dataset = self.subsample(dataset, indices) + elif split == 'train': + dataset = self.raw_datasets[split] + else: + assert split == 'test', print("expected test, but got {}".format(split)) + dataset = self.raw_datasets[split] + return dataset \ No newline at end of file diff --git a/examples/setup_seq2seq.py b/examples/setup_seq2seq.py new file mode 100755 index 0000000..0aa5a64 --- /dev/null +++ b/examples/setup_seq2seq.py @@ -0,0 +1,48 @@ +"""Install Compacter.""" +import os +import setuptools +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +#os.environ['TORCH_CUDA_ARCH_LIST']="3.5;3.7;6.1;7.0;7.5;8.6+PTX" + +def setup_package(): + long_description = "examples_seq2seq" + setuptools.setup( + name='examples_seq2seq', + version='0.0.1', + description='seq2seq example', + long_description=long_description, + long_description_content_type='text/markdown', + author='Shengding Hu', + license='MIT License', + packages=setuptools.find_packages( + exclude=['docs', 'tests', 'scripts']), + dependency_links=[ + 'https://download.pytorch.org/whl/torch_stable.html', + ], + classifiers=[ + 'Intended Audience :: Developers', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: MIT License', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.7.10', + ], + keywords='text nlp machinelearning', + # ext_modules=[ + # CUDAExtension('seq2seq.projections.fwh_cuda', + # sources=[ + # 'seq2seq/projections/fwh_cuda/fwh_cpp.cpp', + # 'seq2seq/projections/fwh_cuda/fwh_cu.cu', + # ] + # ) + # ] + # , + cmdclass={"build_ext": BuildExtension}, + install_requires=[ + ], + ) + + +if __name__ == '__main__': + setup_package() diff --git a/examples/tutorial/0_basic.py b/examples/tutorial/0_basic.py new file mode 100644 index 0000000..2cfd8dd --- /dev/null +++ b/examples/tutorial/0_basic.py @@ -0,0 +1,26 @@ +from transformers import AutoModelForSequenceClassification +model = 
AutoModelForSequenceClassification.from_pretrained("facebook/bart-base") +# suppose we load BART + +from opendelta import Visualization +print("before modify") +Visualization(model).structure_graph() +""" +The white part is the name of the module. +The green part is the module's type. +The blue part is the tunable parameters, i.e., the parameters that require grad computation. +The grey part is the frozen parameters, i.e., the parameters that do not require grad computation. +The red part is the structure that is repeated and thus folded. +The purple part is the delta parameters inserted into the backbone model. +""" + +from opendelta import LoraModel +delta_model = LoraModel(backbone_model=model, modified_modules=['fc2']) +print("after modify") +delta_model.log() +# This will visualize the backbone after modification and other information. + +delta_model.freeze_module(exclude=["deltas", "layernorm_embedding"], set_state_dict=True) +print("after freeze") +delta_model.log() +# The set_state_dict=True will tell the method to change the state_dict of the backbone_model to maintaining only the trainable parts. diff --git a/examples/tutorial/0_interactive.py b/examples/tutorial/0_interactive.py new file mode 100644 index 0000000..c164a13 --- /dev/null +++ b/examples/tutorial/0_interactive.py @@ -0,0 +1,11 @@ +from transformers import BertForMaskedLM +model = BertForMaskedLM.from_pretrained("bert-base-cased") +# suppose we load BERT + +from opendelta import LoraModel +delta_model = LoraModel(backbone_model=model, interactive_modify=True) +# This will visualize the backbone after modification and other information. + +delta_model.freeze_module(exclude=["deltas", "layernorm_embedding"], set_state_dict=True) +delta_model.log() + diff --git a/examples/tutorial/1_with_openprompt.py b/examples/tutorial/1_with_openprompt.py new file mode 100644 index 0000000..35c39f9 --- /dev/null +++ b/examples/tutorial/1_with_openprompt.py @@ -0,0 +1,156 @@ +""" +This tutorial is a copy of OpenPrompt's tutorial/1.1_mixed_template.py +The only modification is in lines 98 to 102 + +1. OpenPrompt provides pre-processing of data, such as prompt template formatting +2. OpenPrompt pre-process the model input, such as: prompt soft embedding +3. OpenDelta modify the backbone model, such as: Adapter, Lora, Compactor, etc. +4. OpenPrompt post-process the model output, such as: extract logits at position, apply prompt verbalizer +""" + +# load dataset +from datasets import load_dataset +from datasets import load_from_disk +# raw_dataset = load_dataset('super_glue', 'cb', cache_dir="../datasets/.cache/huggingface_datasets") +raw_dataset = load_from_disk("/home/hx/huggingface_datasets/saved_to_disk/super_glue.cb") +# Note that if you are running this scripts inside a GPU cluster, there are chances are you are not able to connect to huggingface website directly. +# In this case, we recommend you to run `raw_dataset = load_dataset(...)` on some machine that have internet connections. +# Then use `raw_dataset.save_to_disk(path)` method to save to local path. +# Thirdly upload the saved content into the machiine in cluster. +# Then use `load_from_disk` method to load the dataset. 
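+# A minimal sketch of that offline workflow (the path below is a placeholder, not a path from this repo):
+#   raw_dataset = load_dataset('super_glue', 'cb')            # on a machine with internet access
+#   raw_dataset.save_to_disk("/path/to/super_glue.cb")        # copy the saved directory to the cluster
+#   raw_dataset = load_from_disk("/path/to/super_glue.cb")    # inside the cluster job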
+ +from openprompt.data_utils import InputExample + +dataset = {} +for split in ['train', 'validation', 'test']: + dataset[split] = [] + for data in raw_dataset[split]: + input_example = InputExample(text_a = data['premise'], text_b = data['hypothesis'], label=int(data['label']), guid=data['idx']) + dataset[split].append(input_example) +print(dataset['train'][0]) + +# You can load the plm related things provided by openprompt simply by calling: +from openprompt.plms import load_plm +plm, tokenizer, model_config, WrapperClass = load_plm("t5", "t5-base") + +# Constructing Template +# A template can be constructed from the yaml config, but it can also be constructed by directly passing arguments. +from openprompt.prompts import MixedTemplate +template_text = '{"placeholder":"text_a"} {"soft"} {"soft"} {"soft"} {"placeholder":"text_b"}? {"soft"} {"soft"} {"soft"} {"mask"}.' +mytemplate = MixedTemplate(model=plm, tokenizer=tokenizer, text=template_text) + +# To better understand how does the template wrap the example, we visualize one instance. + +wrapped_example = mytemplate.wrap_one_example(dataset['train'][0]) +print(wrapped_example) + +# Now, the wrapped example is ready to be pass into the tokenizer, hence producing the input for language models. +# You can use the tokenizer to tokenize the input by yourself, but we recommend using our wrapped tokenizer, which is a wrapped tokenizer tailed for InputExample. +# The wrapper has been given if you use our `load_plm` function, otherwise, you should choose the suitable wrapper based on +# the configuration in `openprompt.plms.__init__.py`. +# Note that when t5 is used for classification, we only need to pass to decoder. +# The loss is calcaluted at . Thus passing decoder_max_length=3 saves the space +wrapped_t5tokenizer = WrapperClass(max_seq_length=128, decoder_max_length=3, tokenizer=tokenizer,truncate_method="head") +# or +from openprompt.plms import T5TokenizerWrapper +wrapped_t5tokenizer= T5TokenizerWrapper(max_seq_length=128, decoder_max_length=3, tokenizer=tokenizer,truncate_method="head") + +# You can see what a tokenized example looks like by +tokenized_example = wrapped_t5tokenizer.tokenize_one_example(wrapped_example, teacher_forcing=False) +print(tokenized_example) +print(tokenizer.convert_ids_to_tokens(tokenized_example['input_ids'])) +print(tokenizer.convert_ids_to_tokens(tokenized_example['decoder_input_ids'])) + +# Now it's time to convert the whole dataset into the input format! +# Simply loop over the dataset to achieve it! + +model_inputs = {} +for split in ['train', 'validation', 'test']: + model_inputs[split] = [] + for sample in dataset[split]: + tokenized_example = wrapped_t5tokenizer.tokenize_one_example(mytemplate.wrap_one_example(sample), teacher_forcing=False) + model_inputs[split].append(tokenized_example) + + +# We provide a `PromptDataLoader` class to help you do all the above matters and wrap them into an `torch.DataLoader` style iterator. +from openprompt import PromptDataLoader + +train_dataloader = PromptDataLoader(dataset=dataset["train"], template=mytemplate, tokenizer=tokenizer, + tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3, + batch_size=4,shuffle=True, teacher_forcing=False, predict_eos_token=False, + truncate_method="head") + + +# Define the verbalizer +# In classification, you need to define your verbalizer, which is a mapping from logits on the vocabulary to the final label probability. 
Let's have a look at the verbalizer details: + +from openprompt.prompts import ManualVerbalizer +import torch + +# for example the verbalizer contains multiple label words in each class +myverbalizer = ManualVerbalizer(tokenizer, num_classes=3, label_words=[["yes"], ["no"], ["maybe"]]) + +print("label_words_ids", myverbalizer.label_words_ids) + +# Although you can manually combine the plm, template, verbalizer together, we provide a pipeline +# model which take the batched data from the PromptDataLoader and produce a class-wise logits + +from opendelta import LoraModel +# delta_model = LoraModel(backbone_model=plm, modified_modules=[]) +delta_model = LoraModel(backbone_model=plm, modified_modules=["SelfAttention.q", "SelfAttention.v"]) +delta_model.freeze_module(exclude=["deltas"], set_state_dict=True) +delta_model.log() + +from openprompt import PromptForClassification + +use_cuda = True +prompt_model = PromptForClassification(plm=plm, template=mytemplate, verbalizer=myverbalizer) +if use_cuda: + prompt_model = prompt_model.cuda() + +# Now the training is standard +from transformers import AdamW, get_linear_schedule_with_warmup +loss_func = torch.nn.CrossEntropyLoss() +no_decay = ['bias', 'LayerNorm.weight'] +# it's always good practice to set no decay to biase and LayerNorm parameters +optimizer_grouped_parameters = [ + {'params': [p for n, p in prompt_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in prompt_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} +] +print([n for n, p in prompt_model.named_parameters()]) + +optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4) + +for epoch in range(30): + tot_loss = 0 + for step, inputs in enumerate(train_dataloader): + if use_cuda: + inputs = inputs.cuda() + logits = prompt_model(inputs) + labels = inputs['label'] + loss = loss_func(logits, labels) + loss.backward() + tot_loss += loss.item() + optimizer.step() + optimizer.zero_grad() + if step %100 ==1: + print("Epoch {}, average loss: {}".format(epoch, tot_loss/(step+1)), flush=True) + +# Evaluate +validation_dataloader = PromptDataLoader(dataset=dataset["validation"], template=mytemplate, tokenizer=tokenizer, + tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3, + batch_size=4,shuffle=False, teacher_forcing=False, predict_eos_token=False, + truncate_method="head") + +allpreds = [] +alllabels = [] +for step, inputs in enumerate(validation_dataloader): + if use_cuda: + inputs = inputs.cuda() + logits = prompt_model(inputs) + labels = inputs['label'] + alllabels.extend(labels.cpu().tolist()) + allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist()) + +acc = sum([int(i==j) for i,j in zip(allpreds, alllabels)])/len(allpreds) +print(acc) \ No newline at end of file diff --git a/opendelta/__init__.py b/opendelta/__init__.py new file mode 100644 index 0000000..5e02fcb --- /dev/null +++ b/opendelta/__init__.py @@ -0,0 +1,17 @@ + +__version__ = "0.0.1" + +from .delta_configs import BaseDeltaConfig +from .utils import logging +from .utils.saving_loading_utils import SaveLoadMixin +from .basemodel import DeltaBase +from .auto_delta import AutoDeltaConfig, AutoDeltaModel +from .utils.structure_mapping import CommonStructureMap +from .delta_models.lora import LoraModel +from .delta_models.bitfit import BitFitModel +from .delta_models.compacter import CompacterModel +from .delta_models.adapter import AdapterModel +from .delta_models.prefix import PrefixModel +from 
.delta_models.soft_prompt import SoftPromptModel +from .delta_models.low_rank_adapter import LowRankAdapterModel +from .utils.visualization import Visualization \ No newline at end of file diff --git a/opendelta/auto_delta.py b/opendelta/auto_delta.py new file mode 100644 index 0000000..9680c36 --- /dev/null +++ b/opendelta/auto_delta.py @@ -0,0 +1,423 @@ +from copy import deepcopy +from typing import Any, Dict, OrderedDict +from opendelta.utils.visualization import Visualization +import torch.nn as nn +from transformers.file_utils import PushToHubMixin +from opendelta.utils.logging import get_logger +import importlib +from opendelta.delta_configs import BaseDeltaConfig + +logger = get_logger(__name__) + + +DELTA_CONFIG_MAPPING = { + "lora": "LoraConfig", + "low_rank_adapter": "LowRankAdapterConfig", + "bitfit": "BitFitConfig", + "adapter":"AdapterConfig", + "compacter":"CompacterConfig", + "prefix": "PrefixConfig", + "soft_prompt": "SoftPromptConfig", +} + +DELTA_MODEL_MAPPING = { + "lora": "LoraModel", + "low_rank_adapter": "LowRankAdapterModel", + "bitfit": "BitFitModel", + "adapter":"AdapterModel", + "compacter": "CompacterModel", + "prefix": "PrefixModel", + "soft_prompt": "SoftPromptModel", +} + +class _LazyConfigMapping(OrderedDict): + """ + A dictionary that lazily load its values when they are requested. + """ + + def __init__(self, mapping): + self._mapping = mapping + self._extra_content = {} + self._modules = {} + + def __getitem__(self, key): + if key in self._extra_content: + return self._extra_content[key] + if key not in self._mapping: + raise KeyError(key) + value = self._mapping[key] + module_name = key #model_type_to_module_name(key) + # if module_name not in self._modules: + self._modules[module_name] = importlib.import_module(f".{module_name}", "opendelta.delta_models") + return getattr(self._modules[module_name], value) + + def keys(self): + return list(self._mapping.keys()) + list(self._extra_content.keys()) + + def values(self): + return [self[k] for k in self._mapping.keys()] + list(self._extra_content.values()) + + def items(self): + return [(k, self[k]) for k in self._mapping.keys()] + list(self._extra_content.items()) + + def __iter__(self): + return iter(list(self._mapping.keys()) + list(self._extra_content.keys())) + + def __contains__(self, item): + return item in self._mapping or item in self._extra_content + + def register(self, key, value): + """ + Register a new configuration in this mapping. + """ + if key in self._mapping.keys(): + raise ValueError(f"'{key}' is already used by a Transformers config, pick another name.") + self._extra_content[key] = value + + +LAZY_CONFIG_MAPPING = _LazyConfigMapping(DELTA_CONFIG_MAPPING) + + + +class AutoDeltaConfig: + r""" + This is a generic configuration class that will be instantiated as one of the configuration classes of the library + when created with the :py:meth:`~AutoConfig.from_pretrained` class method. + This class cannot be instantiated directly using ``__init__()`` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "AutoConfig is designed to be instantiated " + "using the ``AutoConfig.from_pretrained(pretrained_model_name_or_path)`` method." + ) + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any], **kwargs): + r""" Instantiate a DeltaConfig according to the dict. Automatically load the config specified by + :obj:`delta_type`. + + Args: + config_dict (:obj:`dict`): The dict of configs of delta model. + kwargs: Other keyword argument pass to initialize the config. 
+ + >>> config = AutoDeltaConfig.from_dict({"delta_type":"lora"}) # This will load the dault lora config. + >>> config = AutoDeltaConfig.from_dict({"delta_type":"lora", "lora_r":5}) # Will load the default lora config, with lora_r = 5 + + """ + config_dict = deepcopy(config_dict) + delta_type = config_dict.pop("delta_type", None) + if delta_type is None: + raise RuntimeError("Do not specify a delta type, cannot load the default config") + config_class = LAZY_CONFIG_MAPPING[delta_type] + return config_class.from_dict(config_dict, **kwargs) + + + @classmethod + def from_finetuned(cls, finetuned_model_name_or_path, **kwargs): + r""" + Instantiate one of the configuration classes of the library from a finetuned delta model configuration. + The configuration class to instantiate is selected based on the ``delta_type`` property of the config object that + is loaded. + + Parameters: + + finetuned_model_name_or_path (:obj:`str` or :obj:`os.PathLike`, *optional*): + Can be either: + + - A string, the *model id* of a finetuned delta model configuration hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like ``Davin/lora``, or + namespaced under a user or organization name, like ``DeltaHub/lora_t5-base_mrpc``. + - A path to a *directory* containing a configuration file saved using the + :py:meth:`DeltaBase.save_finetuned` method, + e.g., ``./my_model_directory/``. + - A path or url to a saved configuration JSON *file*, e.g., + ``./my_model_directory/configuration.json``. + The last two option are not tested but inherited from huggingface. + cache_dir (:obj:`str` or :obj:`os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (:obj:`bool`, *optional*, defaults to :obj:`False`): + Whether or not to force the (re-)download the model weights and configuration files and override the + cached versions if they exist. + resume_download (:obj:`bool`, *optional*, defaults to :obj:`False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (:obj:`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + revision(:obj:`str`, *optional*, defaults to ``"main"``): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + return_unused_kwargs (:obj:`bool`, *optional*, defaults to ``False``): + If ``False``, then this function returns just the final configuration object. + If ``True``, then this functions returns a ``Tuple(config, unused_kwargs)`` where *unused_kwargs* is a + dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., the + part of ``kwargs`` which has not been used to update ``config`` and is otherwise ignored. + trust_remote_code (:obj:`bool`, *optional*, defaults to ``False``): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to ``True`` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. 
+ kwargs(additional keyword arguments, *optional*): + The values in kwargs of any keys which are configuration attributes will be used to override the loaded + values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled + by the ``return_unused_kwargs`` keyword parameter. + + Examples: + + .. code-block:: python + + from transformers import AutoConfig + delta_config = AutoDeltaConfig.from_finetuned("DeltaHub/lora_t5-base-mrpc") + + """ + + kwargs["name_or_path"] = finetuned_model_name_or_path + + config_dict, _ = BaseDeltaConfig.get_config_dict(finetuned_model_name_or_path, **kwargs) + if "delta_type" in config_dict: + config_class = LAZY_CONFIG_MAPPING[config_dict["delta_type"]] + return config_class.from_dict(config_dict, **kwargs) + else: + # Fallback: use pattern matching on the string. + for pattern, config_class in LAZY_CONFIG_MAPPING.items(): + if pattern in str(finetuned_model_name_or_path): + return config_class.from_dict(config_dict, **kwargs) + + raise ValueError( + f"Unrecognized model in {finetuned_model_name_or_path}. " + f"Should have a `delta_type` key in the loaded config, or contain one of the following strings " + f"in its name: {', '.join(LAZY_CONFIG_MAPPING.keys())}" + ) + +### AutoModels below + +class _LazyAutoMapping(OrderedDict): + """ + " A mapping config to object (model or tokenizer for instance) that will load keys and values when it is accessed. + + Args: + + - config_mapping: The map model type to config class + - model_mapping: The map model type to model (or tokenizer) class + """ + + def __init__(self, config_mapping, model_mapping): + self._config_mapping = config_mapping + self._reverse_config_mapping = {v: k for k, v in config_mapping.items()} + self._model_mapping = model_mapping + self._extra_content = {} + self._modules = {} + + def __getitem__(self, key): + if key in self._extra_content: + return self._extra_content[key] + model_type = self._reverse_config_mapping[key.__name__] + if model_type not in self._model_mapping: + raise KeyError(key) + model_name = self._model_mapping[model_type] + return self._load_attr_from_module(model_type, model_name) + + def _load_attr_from_module(self, model_type, attr): + if model_type not in self._modules: + self._modules[model_type] = importlib.import_module(f".{model_type}", "opendelta.delta_models") + return getattribute_from_module(self._modules[model_type], attr) + + def keys(self): + mapping_keys = [ + self._load_attr_from_module(key, name) + for key, name in self._config_mapping.items() + if key in self._model_mapping.keys() + ] + return mapping_keys + list(self._extra_content.keys()) + + def get(self, key, default): + try: + return self.__getitem__(key) + except KeyError: + return default + + def __bool__(self): + return bool(self.keys()) + + def values(self): + mapping_values = [ + self._load_attr_from_module(key, name) + for key, name in self._model_mapping.items() + if key in self._config_mapping.keys() + ] + return mapping_values + list(self._extra_content.values()) + + def items(self): + mapping_items = [ + ( + self._load_attr_from_module(key, self._config_mapping[key]), + self._load_attr_from_module(key, self._model_mapping[key]), + ) + for key in self._model_mapping.keys() + if key in self._config_mapping.keys() + ] + return mapping_items + list(self._extra_content.items()) + + def __iter__(self): + return iter(self.keys()) + + def __contains__(self, item): + if item in self._extra_content: + return True + if not hasattr(item, "__name__") or item.__name__ 
not in self._reverse_config_mapping: + return False + model_type = self._reverse_config_mapping[item.__name__] + return model_type in self._model_mapping + + def register(self, key, value): + """ + Register a new model in this mapping. + """ + if hasattr(key, "__name__") and key.__name__ in self._reverse_config_mapping: + model_type = self._reverse_config_mapping[key.__name__] + if model_type in self._model_mapping.keys(): + raise ValueError(f"'{key}' is already used by a Transformers model.") + + self._extra_content[key] = value + + + +LAZY_DELTA_MAPPING = _LazyAutoMapping(DELTA_CONFIG_MAPPING, DELTA_MODEL_MAPPING) + + + +def get_values(model_mapping): + result = [] + for model in model_mapping.values(): + if isinstance(model, (list, tuple)): + result += list(model) + else: + result.append(model) + + return result + + +def getattribute_from_module(module, attr): + if attr is None: + return None + if isinstance(attr, tuple): + return tuple(getattribute_from_module(module, a) for a in attr) + if hasattr(module, attr): + return getattr(module, attr) + # Some of the mappings have entries model_type -> object of another model type. In that case we try to grab the + # object at the top level. + transformers_module = importlib.import_module("transformers") + return getattribute_from_module(transformers_module, attr) + + + +class AutoDeltaModel: + r""" + """ + _delta_model_mapping = LAZY_DELTA_MAPPING + def __init__(self, *args, **kwargs): + raise EnvironmentError( + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or " + f"`{self.__class__.__name__}.from_config(config)` methods." + ) + + @classmethod + def from_config(cls, config, backbone_model, **kwargs): + r"""Automatically instantiates a delta model based on the :obj:`config`. The delta model correspond to the delta + :obj:`config` will be loaded and initialized using the arguments in :obj:`config`. + + .. note:: + Only using :meth:`from_config` method will not load the finetuned weight file (e.g., pytorch_model.bin). + Please use from_finetuned directly. + + Args: + config (:obj:`BaseDeltaConfig`): + backbone_model (:obj:`nn.Module`): + + Examples: + + .. code-block:: python + + config = AutoDeltaConfig.from_finetuned("DeltaHub/lora_t5-base_mrpc") + delta_model = AutoDeltaModel.from_config(config, backbone_model) + + """ + if type(config) in cls._delta_model_mapping.keys(): + model_class = cls._delta_model_mapping[type(config)] + return model_class.from_config(config, backbone_model, **kwargs) + + raise ValueError( + f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" + f"Model type should be one of {', '.join(c.__name__ for c in cls._delta_model_mapping.keys())}." + ) + + @classmethod + def from_finetuned(cls, finetuned_model_name_or_path, backbone_model, *model_args, **kwargs): + r""" Automatically instantiated a delta model and load the finetuned checkpoints based on the + :obj:`finetuned_model_name_or_path`, which can either be a string pointing to a local path or a url pointint to + the delta hub. It will check the hash after loading the delta model to see whether the correct backbone and + delta checkpoint are used. + + Args: + finetuned_model_name_or_path (:obj:`str` or :obj:`os.PathLike`, *optional*): + Can be either: + + - A string, the *model id* of a finetuned delta model configuration hosted inside a model repo on + huggingface.co. 
Valid model ids can be located at the root-level, like ``Davin/lora``, or + namespaced under a user or organization name, like ``DeltaHub/lora_t5-base_mrpc``. + - A path to a *directory* containing a configuration file saved using the + :py:meth:`DeltaBase.save_finetuned` method, + e.g., ``./my_model_directory/``. + - A path or url to a saved configuration JSON *file*, e.g., + ``./my_model_directory/configuration.json``. + The last two option are not tested but inherited from huggingface. + + backbone_model (:obj:`nn.Module`): The backbone model to be modified. + model_args: Other argument for initialize the model. + + Example: + + .. code-block:: python + + delta_model = AutoDeltaModel.from_finetuned("DeltaHub/lora_t5-base-mrpc", backbone_model) + + """ + config = kwargs.pop("config", None) + + if not isinstance(config, BaseDeltaConfig): + config, kwargs = AutoDeltaConfig.from_finetuned( + finetuned_model_name_or_path, return_unused_kwargs=True, **kwargs + ) + if type(config) in cls._delta_model_mapping.keys(): + model_class = cls._delta_model_mapping[type(config)] + return model_class.from_finetuned(finetuned_model_name_or_path, backbone_model, *model_args, **kwargs) + raise ValueError( + f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" + f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." + ) + + + + + +if __name__ == "__main__": + + config = AutoDeltaConfig.from_dict({"delta_type":"lora", "lora_r": 7}) + + + from transformers import AutoModelForSequenceClassification + model = AutoModelForSequenceClassification.from_pretrained("../../plm_cache/roberta-base/", num_labels=2) + # from IPython import embed + delta_model = AutoDeltaModel.from_config(config, model) + delta_model.freeze_module(exclude = ['deltas','classifier'], set_state_dict = True) + + + # delta_model.save_finetuned("autodelta_try", push_to_hub=True, private=True) + delta_model = AutoDeltaModel.from_finetuned("ShengdingHu/autodelta_try", model, use_auth_token=True) + + + + diff --git a/opendelta/basemodel.py b/opendelta/basemodel.py new file mode 100644 index 0000000..e7dea09 --- /dev/null +++ b/opendelta/basemodel.py @@ -0,0 +1,718 @@ + + +from collections import OrderedDict +from multiprocessing.sharedctypes import Value +import os +from opendelta.delta_configs import BaseDeltaConfig +from opendelta.utils.model_md5 import gen_model_hash +from opendelta.utils.signature import get_arg_names, signature +from typing import Optional, Union +from opendelta.utils.cuda import get_device +from opendelta.utils.name_based_addressing import * +import torch.nn as nn +import torch +from functools import wraps +# from decorator import decorate +from opendelta.utils.decorate import decorate +from opendelta.utils.structure_mapping import transform +from transformers.file_utils import PushToHubMixin +from transformers.deepspeed import deepspeed_config, is_deepspeed_zero3_enabled +from opendelta import SaveLoadMixin +from opendelta import logging +from opendelta.utils.structure_mapping import CommonStructureMap +from opendelta.utils.interactive.web import interactive +from opendelta.utils.data_parallel import new_replicate_for_data_parallel +logger = logging.get_logger(__name__) + +def is_leaf_module(module): + r"""Whether the module is a leaf module + """ + try: + return len([n for n,_ in module.named_children()]) == 0 + except: + from IPython import embed + embed() + +def non_module_param(module: nn.Module): + module_names = [n for n, _ in 
module.named_modules()] + ret = [] + for n, p in module.named_parameters(): + if not is_child_key(n, module_names): + ret.append((n,p)) + return ret + + + + + +class DeltaBase(nn.Module, SaveLoadMixin): + r"""This is the base class for all delta models. It provides four simple but effective functionalities + for building the delta model: + + #. addressing a module inside the backbone model using a minimal description key. + #. provide the interface for modifying and inserting model which keeps the docs/IO the same as the module + before modification. + #. pass a pseudo input to determine the inter dimension of the delta models. + #. freeze a part of model parameters according to key. + + It also provides unified interface for model loading and saving. + + Class attributes (overridden by derived classes): + + - delta_type (:obj:`str`): the name of the delta modules, used to create the correct :class:`opendelta.AutoDeltaModel`. + - config_class (:class:`BaseDeltaConfig`): The corresponding config model + + + Args: + backbone_model (:obj:`nn.Module`, *required*): backbone model that the delta models are build opon. The modification to the + backbone model are in place. + modified_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules are subjected to update. + + .. note:: + leave this argument :obj:`None` will make the delta model return to the default setting, which add the delta + models to the position experimented the paper. In this setting, the common structure mapping is loaded to + addressing the corresponding modules. + + registraction_name (:obj:`str`, *optional*, default to ``"deltas"``): The root name of the delta models when + attached to the backbone model. + common_structure (:obj:`bool`, *optional*, default to :obj:`None`): Whether use the common structure mapping to specify the + modified_modules. i.e., if common_structure=True, then we use a common ["attn"] for attention module in different models. + We DO NOT recommend manually set ``common_structure`` to ``true`` by yourself unless you are using delta + among multiple backbones and don't want to modify the code. + + interactive_modify (:obj:`bool` or :obj:`int`, *optional*, default to :obj:`None`): Whether to use interactive modification. + By setting to :obj:`int` can specify the port of web server. + """ + delta_type = "" + default_modified_modules = [] + config_class = BaseDeltaConfig + default_unfrozen_modules = ["deltas"] + def __init__(self, + backbone_model: nn.Module, + modified_modules: Optional[List[str]] = None, + unfrozen_modules: Optional[List[str]] = None, + interactive_modify: Optional[Union[bool, int]] = False, + common_structure = False, + ): + nn.Module.__init__(self) + # register the backbone model after init using self.__dict__ method to avoid adding backbone_model + # to the modules of the delta model. 
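+        # (Assigning through ``self.__dict__`` bypasses ``nn.Module.__setattr__``, so the backbone is not
+        # registered as a child module; its parameters therefore do not show up in this delta model's
+        # ``parameters()`` or ``state_dict()``.)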
+ self.__dict__["backbone_model"] = backbone_model + if modified_modules is None: + if interactive_modify: + if isinstance(interactive_modify, bool) and interactive_modify==True: + self.modified_modules = interactive(backbone_model) + else: + self.modified_modules = interactive(backbone_model, port=interactive_modify) + self.common_structure = False + else: + self.modified_modules = self.default_modified_modules + self.common_structure = True + else: + if interactive_modify: + raise ValueError("Use modified_modules and interactive_modify at the same time is not supported") + self.modified_modules = modified_modules + self.common_structure = common_structure + if self.common_structure: + self.structure_mapping = CommonStructureMap.load(self.backbone_model) + else: + self.structure_mapping = None + if unfrozen_modules is None: + self.unfrozen_modules = self.default_unfrozen_modules + if self.common_structure and self.structure_mapping is None: + raise RuntimeError("Using common structure but the structure mapping is None") + + def forward(self, *args, **kwargs) -> "RuntimeError": + r""" + .. warning:: + + Removed method. As the model is a delta model, which should be attached to a backbone model \ + and can't forward any data by itself. Please using the backbone model's forward function \ + after attach the delta model to the backbone. + """ + raise RuntimeError("This is a delta model, which should be attached to a backbone model \ + and can't forward any data by itself. Please using the backbone model's forward function \ + after attach the delta model to the backbone. ") + + @classmethod + def from_config(cls, config: Union[BaseDeltaConfig, dict], backbone_model: nn.Module, check_hash=True, **kwargs): + r"""Initialize a delta model from a config object or a dict containing the configs. To temperarily change + a value in the config, pass it through kwargs. If the config has a backbone model's hash, which means it is + a finetuned delta model's config, then we will compare the hash in the config and the newly caculated to ensure + the finedtuned delta model is trained on the passed backbone_model. Pass ``check_hash=False`` to disable the + checking. + + Args: + config (:obj:`BaseDeltaConfig` or :obj:`dict`) A config object or a dict that contains the necessary value to + initialize the delta model. + backbone_model (:obj:`nn.Module`) A pytorch module that will be pass into the delta model as the backbone + model. modifications will be made in place in the backbone model. + check_hash (:obj:`bool`, default to ``True``) Whether to check hash of the backbone model and the config's + backbone hash. + kwargs: Any configurations that are passed to update the config object. #TODO unit test needed. + """ + supported_keys = get_arg_names(cls.__init__) + get_arg_names(DeltaBase.__init__) + config_dict = config.to_dict() + for key in list(config_dict.keys()): + if key not in supported_keys: + config_dict.pop(key) + return cls(backbone_model, **config_dict) + + + def add_all_delta_to_backbone(self, + backbone: nn.Module, + modified_modules: List[str], + ) -> nn.Module: + r"""The main function to add delta models to the backbone model based on the :obj:`modified_modules`. + + + Args: + backbone_model (:obj:`nn.Module`, *required*) backbone model that the delta models are build opon. The + modification to the backbone model are in place. + modified_modules (:obj:`List[str]`, *optional*, default to :obj:`None`) The modules are subjected to update. 
+ leave this argument :obj:`None` will make the delta model return to the default setting, which add the delta + models to the position experimented the paper. In this setting, the common structure mapping is loaded to + addressing the corresponding modules. + + Returns: + :obj:`nn.Module` The modified backbone model. + + """ + self.plm_total_params = sum(p.numel() for p in backbone.parameters()) + # create a new key list to avoid recursion. + backbone_key_list = [key for key, _ in backbone.named_modules()] + for key in backbone_key_list: + if self.find_key(key, modified_modules): #TODO may have bugs when commonstructure has a virtual node and it's refered + logger.debug("find key: {}".format(key)) + self.update_module(backbone, key) + self._pseudo_data_to_instantiate(backbone) + # mark the paratmers that are the delta parameters for easily displaying the delta_paramters. + self.mark_as_delta() + return backbone + + + + def mark_as_delta(self, module: nn.Module=None,): + r"""[NODOC] Mark :obj:`module`'s all parameters as delta parameters by setting a ``_is_delta`` attribute to each of them. + Generally, it is used after creating the delta modules. By leaving module to :obj:`None`, it will mark all the parameters in the + delta model as ``_is_delta``. + + Args: + module (:obj:`nn.Module`): The module to mark as delta. + """ + if module is None: + module=self # all the parameters in the delta model. + for p in module.parameters(): + setattr(p, "_is_delta", True) + + def update_module(self, module: nn.Module, key: str): + r"""Update a module specified by :obj:`key`. The method is reimplemented in each specific delta model. + """ + raise NotImplementedError + + + def freeze_module(self, + module: Optional[nn.Module] = None, + exclude: Optional[List[str]] = None, + set_state_dict: Optional[bool]=True, + ): + r"""Freeze the parameters of plm. Leave the parameters in exclude untouched. + deltas module is filtered with ``_is_delta`` attributes because it may have parameter sharing to the main + model, (e.g., bias term) + + Args: + module (:obj:`nn.Module`, *optional*, default to :obj:`None`): The module of which some parts are frozen. + If left with :obj:`None`, the function will the self.backbone_model as the module to be frozen. + exclude (:obj:`List[str]`, *optional*, default to ``["deltas"]``): The parameters that don't need to + be freezed. Default to all the delta parameters. + set_state_dict (:obj:`bool`, *optional*, default to :obj:`True`): Whether setting the backbone model's state + dict to all the parameters that still need grad. + prefix (:obj:`str`, *optional*, default to ``""``): A parameters that are used for recursive frozen. + Should not be changed by passing argument other than ``""``. + + """ + if exclude is None: + exclude = self.unfrozen_modules + + if module is None: + module = self.backbone_model + self._freeze_module_recursive(module, exclude, "") # modify the active state dict that still need grad + if set_state_dict: + self.set_active_state_dict(module) + + def _freeze_module_recursive(self, + module: Optional[nn.Module] = None, + exclude: Optional[List[str]] = None, + prefix=""): + r"""[NODOC] Freeze the parameters of plm. Leave the parameters in exclude untouched. + deltas module is filtered with ``_is_delta`` attributes because it may have parameter sharing to the main + model, (e.g., bias term) + + Args: + module (:obj:`nn.Module`, *optional*, default to :obj:`None`): The module of which some parts are frozen. 
+ If left with :obj:`None`, the function will the self.backbone_model as the module to be frozen. + exclude (:obj:`List[str]`, *optional*, default to ``["deltas"]``): The parameters that don't need to + be freezed. Default to all the delta parameters. + set_state_dict (:obj:`bool`, *optional*, default to :obj:`True`): Whether setting the backbone model's state + dict to all the parameters that still need grad. + prefix (:obj:`str`, *optional*, default to ``""``): A parameters that are used for recursive frozen. + Should not be changed by passing argument other than ``""``. + + """ + + if is_leaf_module(module): + for n, p in module.named_parameters(): + if self.find_key(".".join([prefix,n]), exclude, only_tail=True): + continue + if "deltas" not in exclude or (not (hasattr(p, "_is_delta") and getattr(p, "_is_delta"))): + p.requires_grad = False + return + else: + for n, c in module.named_children(): + if self.find_key(".".join([prefix,n]), exclude, only_tail=True): # if found, untouch the parameters + continue + else: # firstly freeze the non module params, then go deeper. + params = non_module_param(module) + for n, p in params: + if "deltas" not in exclude or (not (hasattr(p, "_is_delta") and getattr(p, "_is_delta"))): + p.requires_grad = False + self._freeze_module_recursive(c, exclude=exclude, prefix=".".join([prefix,n]) ) + + + + + + def find_key(self, key: Union[str, re.Pattern], target_list: List[str], only_tail=True): + r"""Check whether any target string is in the key or in the tail of the key, i.e., + + Args: + key (Union[:obj:`str`, :obj:`re.Pattern`]): The key (name) of a submodule in a ancestor module. + E.g., model.encoder.layer.0.attention + target_list (List[:obj:`str`]): The target list that we try to match ``key`` with. E.g., ["attention"] + only_tail (:obj:`bool`): the element in the target_list should be in the tail of key + + Returns: + :obj:`bool` True if the key matchs the target list. + """ + if self.common_structure: + key = self.structure_mapping.transform(key, strict=False) + if not key: + return False + try: + if isinstance(key, re.Pattern): # TODO: unit test needed ERROR + if only_tail: + return endswith_in_regex(key, target_list) + else: + return substring_in_regex(key, target_list) + else: + if only_tail: + return endswith_in(key, target_list) + else: + return substring_in(key, target_list) + except: + from IPython import embed + embed(header = "exception") + + def _pseudo_data_to_instantiate(self, module: Optional[nn.Module]=None): + r"""Create a pseudo_data into the module to know the dimemsion of each tensor in the computation graph. + First try to use the dummy_inputs of the pretrained model. If the model has no dummy_inputs, will try to create + integer tensor as the pseudo_input, if ``decoder_input_ids`` is in the model's forward function, additional create it. + + Args: + module (:obj:`nn.Module`, *optional*, default to :obj:`None`): The backbone model. 
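+
+        Example (an illustrative sketch of the fallback path when the backbone has no ``dummy_inputs``):
+
+        .. code-block:: python
+
+            pseudo_input = torch.tensor([[0, 0]]).to(get_device(module))
+            module(pseudo_input)  # the forward pass lets lazily-built delta layers read the hidden size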
+ + """ + if module is None: + module = self.backbone_model + try: + dummy_inputs = module.dummy_inputs + module(**dummy_inputs) + except AttributeError: + device = get_device(module) + logger.warning("No dummy_inputs attributes, create a common input_ids for input.") + pseudo_input = torch.tensor([[0,0]]).to(device) + if "decoder_input_ids" in signature(module.forward).args: + module(pseudo_input, decoder_input_ids = pseudo_input) + else: + module(pseudo_input) + + def trainable_parameters_names(self, module: Optional[nn.Module]=None): + r"""[NODOC] A small sugar function to return all the trainable parameter's name in the (by default, backbone) model. + + Args: + module (:obj:`nn.Module`): of which module we want to know the trainable paramemters' name. + + Returns: + :obj:`List[str]` + """ + if module is None: + module = self.backbone_model + return [n for n,p in module.named_parameters() if p.requires_grad] + + def frozen_parameters_names(self, module: Optional[nn.Module]=None): + r"""[NODOC] A small sugar function to return all the frozen parameters' name in the (by default, backbone) model. + + Args: + module (:obj:`nn.Module`): of which module we want to know the frozen paramemters' name. + + Returns: + :obj:`List[str]` + """ + if module is None: + module = self.backbone_model + return [n for n,p in module.named_parameters() if not p.requires_grad] + + def trainable_parameters(self,module: Optional[nn.Module]=None): + r"""[NODOC] A small sugar function to return all the frozen parameters in the (by default, backbone) model. + + Args: + module (:obj:`nn.Module`): of which module we want to know the frozen paramemters. + + Returns: + :obj:`List[nn.Parameter]` + """ + if module is None: + module = self + return [p for n,p in module.named_parameters() if p.requires_grad] + + + def num_trainable_parameters(self, module: Optional[nn.Module]=None): + r"""[NODOC] A small sugar function to get the number of trainable parameter in the backbone model. Often used to + compute the trainable rate. + + Args: + module (:obj:`nn.Module`): of which module we want to know the number of trainable paramemters. + + Returns: + :obj:`List[nn.Parameter]` + """ + if module is None: + module = self + pnum_tot = 0 + for param in module.parameters(): + if param.requires_grad: + pnum_tot += param.numel() + return pnum_tot + + def num_total_parameters(self, module: Optional[nn.Module]=None): + r"""[NODOC] A small sugar function to get the number of trainable parameter in the backbone model. Often used to + compute the trainable rate. + + Args: + module (:obj:`nn.Module`): of which module we want to know the number of trainable paramemters. + + Returns: + :obj:`List[nn.Parameter]` + """ + if module is None: + module = self + pnum_tot = 0 + for param in module.parameters(): + pnum_tot += param.numel() + return pnum_tot + + + + def find_module(self, root_module: nn.Module, key:str): + r"""Find the module using a key and the root module. Return both the parent reference, the child name and reference. + + Args: + root_module (:obj:`root_module`): The root_module to find the sub module in + key (:obj:`str`): The relative key to the root module. + + Returns: + (:obj:`nn.Module`, :obj:`str`, :obj:`nn.Module`): + * A reference to the parent module of the target module, mainly for substuting the target module. + * The key of the target module relevant to its parent module + * Target module. 
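+
+        Example (illustrative; the key must name a real submodule of your backbone):
+
+        .. code-block:: python
+
+            parent, name, child = delta_model.find_module(backbone_model, "encoder.layer.0.attention")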
+ """ + sub_keys = key.split(".") + parent_module = root_module + for sub_key in sub_keys[:-1]: + parent_module = getattr(parent_module, sub_key) + module = getattr(parent_module, sub_keys[-1]) + return parent_module, sub_keys[-1], module + + def _register_delta_infos(self, parent_module, _delta_info): + r"""Register the delta infomation. + Automatically incrementing the suffix for repeated delta_names + """ + _delta_infos = getattr(parent_module, "_delta_infos", []) + if len(_delta_infos) > 0: # check if duplicated name + list_of_deltas = [d['delta_name'] for d in _delta_infos] + cur_name = _delta_info['delta_name'] + if cur_name in list_of_deltas: + cur_name = cur_name + "_1" + counter = 1 + while cur_name in list_of_deltas: + counter += 1 + cur_name = cur_name.split("_")[0] + "_"+str(counter) + _delta_info["delta_name"] = cur_name + _delta_infos.append(_delta_info) + setattr(parent_module, "_delta_infos", _delta_infos) + + def replace_module(self, + parent_module: nn.Module, + child_name: str, + child_module: nn.Module, + new_module: nn.Module, + delta_name: Optional[str] = "delta", + ): + r"""Replace a module's child module with the new_module(a delta module). Used by delta method based on direct + replacement, such as :class:`opendelta.delta_modules.lora.LoraModel`. + + Args: + parent_module (:obj:`nn.Module`): The parent module of the replacement. + child_name (:obj:`str`): The chird module's name, i.e., parent_module.child_name give us child_module + child_module (:obj:`nn.Module`): The original child module. + new_module (:obj:`nn.Module`): The delta module. + delta_name (:obj:`str`, *optional*, default ot ``delta``): The name of the delta module, used for recording. + parent_module.delta_name WILL NOT give you the delta module. + """ + self.delta_modules.append(new_module) + setattr(parent_module, child_name, new_module) + # register delta info + _delta_info = {"method": "replace", + "delta_module": new_module, + "child_name": child_name, + "org_module": child_module, + "delta_name": delta_name, + "delta_belong": self, + "state": "on"} + self._register_delta_infos(parent_module=parent_module, + _delta_info = _delta_info, + ) + + + def modify_module(self, module: nn.Module): + r"""Modify the inside parameteres of a module. This method will be reimplemented in different + derived class if needed. + """ + raise NotImplementedError + + def insert_sequential_module(self, module, delta_module=None, name='delta', strict=False, _delta_info=None): + r"""insert a module (previous not exists in the code base) before/after a module. Specifically, it modifies the forward + function of the original module to firstly pass the arguments into the new module's forward function and then pass + it into the original ones. The new module can also be inserted after the original module with similar mechanism. + + When implementing the new module , researchers should be aware of the components of arguments of the original module's forward function. + + Args: + module: (:obj:`nn.Module`): The (sub)module to inserted a delta module. + delta_module: (:obj:`DeltaBase`): The delta module to be inserted. + name: (:obj:`str`, *optional*): The name of the delta in the backbone module. + strict: (:obj:`bool`, *optional*): Whether to prohibit modify a modified module. + _delta_info (:obj:`Dict`, *optional*): Used in attach(), reattach a delta module to backbone. The info of + original delta is passed through ``_delta_info``. 
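+
+        Example (a sketch; ``adapter_layer`` stands for any module exposing ``pre_forward``/``post_forward``):
+
+        .. code-block:: python
+
+            delta_model.insert_sequential_module(backbone_model.encoder.layer[0],
+                                                 delta_module=adapter_layer,
+                                                 name="adapter")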
+ + """ + def _caller(_org_func, org_module, delta_name, *args, **kwargs): + args = args[1:] # the first argument here is ``self`` + delta_module = getattr(org_module, delta_name) + if hasattr(delta_module, "pre_forward"):# is not None: + args, kwargs = delta_module.pre_forward(*args, **kwargs) + # from IPython import embed + # embed(header = "true") + ret = _org_func(*args, **kwargs) + if hasattr(delta_module, "post_forward"):# is not None: + ret = delta_module.post_forward(ret) + return ret + + + if strict: + if hasattr(module.forward, "__wrapped__"): + raise RuntimeWarning("The forward function might have been wrapped by a decorator, is it intended?") + + # record info for plug and unplug and nested wrap + if _delta_info is None: + if delta_module is None: + raise RuntimeError("delta module can't be none to ensure successful replicate of the parent module.") + + _delta_info = {"method": "insert_sequential", + "delta_module": delta_module, + "delta_name": name, + "delta_belong": self, + "state": "on"} + self._register_delta_infos(parent_module=module, + _delta_info = _delta_info) + else: + delta_module = _delta_info["delta_module"] + name = _delta_info["delta_name"] + + setattr(module, _delta_info['delta_name'], _delta_info["delta_module"]) + + new_forward = decorate(module.forward, _caller, extras=(module, _delta_info['delta_name']), kwsyntax=True) # decorator.decorate helps preserving the functions metadata (signature, etc.). + module.forward = new_forward.__get__(module, type(module)) # func.__get__(object, type(object)) register a function as an object's method + # for DataParallel's copy behavior. Experimental: + # may have bugs when module.forward is nestedly wrapped. + module._replicate_for_data_parallel = new_replicate_for_data_parallel.__get__(module, type(module)) + + + + + + def insert_parrellel_module(self, module, pre_caller=None, post_caller=None, delta_module=None, name='delta'): + """insert a module (previous not exists in the code base) across a module. Specifically, it modifies the forward + function of the original module to firstly pass the arguments into the delta model's forward function and set + aside the calculation result. Then combine it with the calculation result output from the backbone module. + + When implementing the new module , researchers should be aware of the arguments and keywards of the original module's forward function. + + # TODO: currently not in use. + """ + raise NotImplementedError + + def set_active_state_dict(self, module: nn.Module): + r"""modify the state_dict function of the model (by default, the backbone model) to return only the tunable part. + + Args: + module (:obj:`nn.Module`): The module modified. The modification is in-place. + """ + def _caller(_org_func, includes, *args, **kwargs): + state_dict = _org_func(*args, **kwargs) + keys = list(state_dict.keys()) + for n in keys: + if n not in includes: + state_dict.pop(n) + return state_dict + includes = self.trainable_parameters_names(module) # use excludes will have trouble when the model have shared weights + # print(includes, "grad:",self.backbone_model.plm.lm_head.weight.requires_grad) + # exit() + if hasattr(module.state_dict, "__wrapped__"): + raise RuntimeWarning("The forward function might have been wrapped by a decorator, is it intended?") + module.state_dict = decorate(module.state_dict, _caller, extras=(includes,), kwsyntax=True) # decorator.decorate helps preserving the functions metadata (signature, etc.). 
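+    # Note (illustrative): once ``set_active_state_dict`` has wrapped a backbone, e.g. through
+    # ``freeze_module(..., set_state_dict=True)``, ``backbone.state_dict()`` returns only the entries
+    # that still require grad, so ``torch.save(backbone.state_dict(), "delta_ckpt.bin")`` keeps the
+    # checkpoint small ("delta_ckpt.bin" is a placeholder file name).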
+ + def _load_state_dict_into_backbone(self, backbone_model: nn.Module = None, state_dict: dict = {}): + r"""[NODOC] + """ + if backbone_model is None: + backbone_model = self.backbone_model + self.backbone_model.load_state_dict(state_dict, strict=False) + + def create_config_from_model(self, ): + r"""[NODOC] If the delta model was built by directly passing arguments, instead of passing a config object. + create the config of the delta model for saving the delta model. + """ + # common_attributes + config = self.config_class() + config_keys = signature(config.__init__)[0] + signature(super(self.config_class, config).__init__)[0] + + for key in config_keys: + val = getattr(self, key) if hasattr(self, key) else None + setattr(config, key, val) + config.delta_type = self.delta_type + self.config = config + + + def log(self, module=None, delta_ratio=True, trainable_ratio=True, visualization=True): + r"""Log and visualize the result of applying delta. + Possible Options are ``trainable_ratio``, + ``visualization``, ``delta_ratio``. + + Args: + delta_ratio (:obj:`bool`, *optional*): Whether computing the ratio of parameters in the delta modules. + trainable_ratio (:obj:`bool`, *optional*): Whether computing the ratio of trainable parameters. + visualization (:obj:`bool`, *optional*): Whether visualize the parameter information of the modified backbone. + + """ + if module is None: + module = self.backbone_model + + + if visualization: + from opendelta import Visualization + Visualization(module).structure_graph() + if trainable_ratio: + n_trainable = self.num_trainable_parameters(module) + n_total = self.num_total_parameters(module) + logger.info("Trainable Ratio: {:2f}%".format(n_trainable/n_total*100)) + if delta_ratio: + n_delta = self.num_delta_parameters(module) + n_total = self.num_total_parameters(module) + logger.info("Delta Parameter Ratio: {:2f}%".format(n_delta/n_total*100)) + + def num_delta_parameters(self, module: Optional[nn.Module]=None): + r"""[NODOC] A small sugar function to get the number of trainable parameter in the backbone model. Often used to + compute the trainable rate. + + Args: + module (:obj:`nn.Module`): of which module we want to know the number of trainable paramemters. + + Returns: + :obj:`List[nn.Parameter]` + """ + if module is None: + module = self.backbone_model + pnum_tot = 0 + for param in module.parameters(): + if hasattr(param, "_is_delta"): + pnum_tot += param.numel() + return pnum_tot + + # Two functions for plug and remove the delta model. + def attach(self, module: Optional[nn.Module]=None,): + r"""Reattach the delta modules to the backbone. Note that this method can not be used to create new delta modules. + Instead, a :meth:`DeltaBase.detach` should precede this method. + + Args: + module (:obj:`object`, *optional*, default to :obj:`None`): The backbone module that we + reattach the deltas to. 
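+
+        Example (illustrative):
+
+        .. code-block:: python
+
+            delta_model.detach()  # temporarily turn the delta modules off
+            delta_model.attach()  # turn them back on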
+ """ + + if module is None: + module = self.backbone_model + + for name, submodule in module.named_modules(): + if hasattr(submodule, "_delta_infos"): + _delta_infos = getattr(submodule, "_delta_infos") + for _delta_info in _delta_infos: + if _delta_info['delta_belong'] is not self: + continue + if _delta_info["state"] == "on": + continue + + if _delta_info['method'] == "replace": + setattr(submodule, _delta_info["child_name"], _delta_info['delta_module']) + elif _delta_info['method'] == "insert_sequential": + self.insert_sequential_module(module=submodule, + _delta_info=_delta_info) + else: + raise NotImplementedError + + _delta_info['state'] = "on" + + + def detach(self, module: Optional[nn.Module]=None,): + r"""Detach the delta module from the backbone. The delta module is not deleted, but temporarily turned off. + Use :meth:`DeltaBase.attach` to reattach the delta model to the backbone. + + Args: + module (:obj:`object`, *optional*, default to :obj:`None`): The backbone module that we + detached the deltas from. + """ + + if module is None: + module = self.backbone_model + + for name, submodule in module.named_modules(): + if hasattr(submodule, "_delta_infos"): + _delta_infos = getattr(submodule, "_delta_infos") + for _delta_info in _delta_infos: + if _delta_info['delta_belong'] is not self: + continue + if _delta_info["state"] == "off": + continue + + if _delta_info['method'] == "replace": + setattr(submodule, _delta_info["child_name"], _delta_info['org_module']) + elif _delta_info['method'] == "insert_sequential": + if hasattr(submodule.forward, "__wrapped__"): + submodule.forward = submodule.forward.__wrapped__ + delattr(submodule, _delta_info["delta_name"]) + else: + raise AttributeError("submodule {}'s forward has no attribute __wrapped__. It'ss not a wrapped function.".format(name)) + else: + raise NotImplementedError + + _delta_info['state'] = "off" + diff --git a/opendelta/delta_configs.py b/opendelta/delta_configs.py new file mode 100644 index 0000000..3958da1 --- /dev/null +++ b/opendelta/delta_configs.py @@ -0,0 +1,476 @@ +import os +import re +from typing import Union, Dict, Any, Tuple, Optional +from opendelta import __version__ as opendelta_version +from opendelta.utils import logging +from opendelta.utils.signature import get_arg_names, get_arg_names_inside_func +import transformers +from transformers.file_utils import ( + PushToHubMixin, + is_offline_mode, + cached_path, + is_remote_url, + get_list_of_files, + hf_bucket_url, +) +from packaging import version +import json +import copy + +CONFIG_NAME = "config.json" +transformers_version = transformers.__version__ + +checked_package_versions = ["transformers_version", "opendelta_version"] + +logger = logging.get_logger(__name__) +FULL_CONFIGURATION_FILE = "config.json" +_re_configuration_file = re.compile(r"config\.(.*)\.json") + +class BaseDeltaConfig(PushToHubMixin): + r"""Base class for all configuration classes. Handles a few + parameters common to all delta models' configurations as well as methods for loading/downloading/saving configurations. + + Class attributes (overridden by derived classes): + + - **delta_type** (:obj:`str`) -- the name of the delta modules, used to create the correct :py:class:`~opendelta.AutoConfig`. + + Args: + modified_modules (:obj:`List[str]`, *optional*, defaults to :obj:``None``) + The list of keys to determine which modules you want to modify. OpenDelta will take every modulees that + **ends with** the one of the provided keys as the modification target. When not given any value, i.e. 
+ ``modified_modules=None``, the delta module will use the it corresponding default modification modules. + Taking DistilBertModel with an classifier on top as an example: + + .. note:: + **Examples**: When adding delta to DistilBertModel, + + 1. set to ``["0.attention.out_lin"]`` will add delta modules to the attention output of distilbert's + ayer 0, i.e., ``distilbert.transformer.layer.0.attention.out_lin``. + + 2. set to ``["attention.out_lin"]`` will add the delta modules in every layer's ``attention.out_lin``. + + unfrozen_modules (:obj:`List[str]`, *optional*, defaults to :obj:`["deltas"]` ) + The modules that are unfrozen + during training. Including the ones that are newly introduced as delta modules, and the ones that are + originally a part of the model but set to trainable (:obj:`requires_grad=True`) to train together with the + delta modules. OpenDelta will take every modules that **ends with** the one of the provided keys and all + its sub-modules and paramters as trainable. + + .. note:: + **Examples**: When adding delta to DistilBertModel, + + 1. set this argument to ``["bias"]`` will make all bias terms tunable. + + 2. set this argument to ``["attention"]`` will make all parameters in all attention modules tunable. + + 3. set this argument to ``["deltas"]`` will make all the parameters in the newly introduced delta + modules tunable. + + 4. set this argument to ``["classifier"]`` will make all parameters in the classifier tunable. + + 5. set this argument to ``["3.ffn.lin2", "deltas", "classifier"]``, will make all parameters in + the third layer's feed forward layer's send linear layer, the detla modules, and the classifiers modules + tunable. + + common_structure (:obj:`bool`, *optional*, default to :obj:`None`): Whether using the common structure mapping of + the transformer model when designating :obj:`modified_modules` and :obj:`unfrozen_modules`. + backbone_class (:obj:`str`, *optional*, default to :obj:`None`): The name of backbone model's class, e.g. + ``RobertaForMaskedLM``. Saving this infomation let the users explicitly know on which backbone the + delta model is trained. + backbone_checkpoint_name (:obj:`str`, *optional*, default to :obj:`None`): The specific checkpoint of the model. + In ideal case, it should be the url to download the checkpoint. However, we do not force the user to + specify a downloadable url here. + backbone_hash (:obj:`str`, *optional*, default to :obj:`None`): The md5-hash of the backbone model. It is + calculated using the string representation of the model and the sequential expansion of all the + parameters in the model. When loading a delta checkpoint in strict mode, the hash of the backbone model + will be compared to the hash in this config. + """ + delta_type: str = "" + + + def __init__(self, + modified_modules = None, + unfrozen_modules = ["deltas"], + common_structure=False, + backbone_class = None, + backbone_checkpoint_name = None, + backbone_hash = None, + ): + arg_names = get_arg_names(BaseDeltaConfig.__init__) + for arg_name in arg_names: + setattr(self, arg_name, locals()[arg_name]) + + + + + @classmethod + def from_finetuned(cls, finetuned_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "BaseDeltaConfig": + r""" + Instantiate a :obj:`BaseDeltaConfig` (or a derived class) from a finetined delta module configuration. 
+ + Args: + finetuned_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): This can be either: + + * a string, the *model id* of a finetuned delta model configuration hosted inside a model repo on + deltahub.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or + namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``. + + * a path to a *directory* containing a configuration file saved using the :meth:`BaseDeltaConfig.save_finetuned` method, e.g., ``./my_model_directory/``. + + * a path or url to a saved configuration JSON *file*, e.g., ``./my_model_directory/configuration.json``. + + cache_dir (:obj:`str` or :obj:`os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained delta model configuration should be cached if the + standard cache should not be used. + + .. code-block:: python + + delta_config = LoraConfig.from_finetuned("DeltaHub/lora_t5-base_mrpc") + + """ + config_dict, kwargs = cls.get_config_dict(finetuned_model_name_or_path, **kwargs) + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warn( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + def save_finetuned(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): + """ + Save a configuration object to the directory :obj:`save_directory`, so that it can be re-loaded using the + :meth:`BaseDeltaConfig.from_finetuned` class method. + + Args: + save_directory (:obj:`str` or :obj:`os.PathLike`): Directory where the configuration JSON file + will be saved (will be created if it does not exist). + push_to_hub (:obj:`bool`, *optional*, defaults to :obj:`False`): Whether or not to push your model to + the Hugging Face model hub after saving it. + + .. warning:: + 1. Will raise error if you haven't config a Huggingface Model Hub. + 2. Using ``push_to_hub=True`` will synchronize the repository you are pushing to with ``save_directory``, + which requires ``save_directory`` to be a local clone of the repo you are pushing to if it's an existing + folder. Pass along ``temp_dir=True`` to use a temporary directory instead. + + kwargs: + Additional key word arguments passed along to the + `PushToHubMixin.push_to_hub `_ method. + """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + repo = self._create_or_get_repo(save_directory, **kwargs) + + os.makedirs(save_directory, exist_ok=True) + # If we save using the predefined names, we can load using `from_pretrained` + output_config_file = os.path.join(save_directory, CONFIG_NAME) + + self.to_json_file(output_config_file, use_diff=True) + logger.info(f"Configuration saved in {output_config_file}") + + if push_to_hub: + url = self._push_to_hub(repo, commit_message=commit_message) + logger.info(f"Configuration pushed to the hub in this commit: {url}") + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "BaseDeltaConfig": + r""" + Instantiate a :obj:`BaseDeltaConfig` from a python dictionary of parameters. + + Args: + config_dict (:obj:`Dict[str, Any]`): + Dictionary that will be used to instantiate the configuration object. 
Such a dictionary can be + retrieved from a pretrained checkpoint by leveraging the :py:meth:`~PretrainedConfig.get_config_dict` method. + kwargs (:obj:`Dict[str, Any]`): + Additional parameters from which to initialize the configuration object. + Returns: + :obj:`BaseDeltaConfig`: The configuration object instantiated from those parameters. + """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + accept_args = get_arg_names(cls.__init__) + get_arg_names(BaseDeltaConfig.__init__) + unused_config_keys = [] + for config_key in list(config_dict.keys()): + if config_key not in accept_args: + config_dict.pop(config_key) + unused_config_keys.append(config_key) + logger.warning(f"The following keys are not used by {cls}.__init__ function: {unused_config_keys}") + config = cls(**config_dict) + + + # Update config with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(config, key): + + setattr(config, key, value) + if key != "torch_dtype": + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + logger.info(f"Model config {config}") + + if return_unused_kwargs: + return config, kwargs + else: + return config + + @classmethod + def get_config_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """[NODOC] + From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a + [``PretrainedConfig``] using ``from_dict``. + Parameters: + pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. + Returns: + :obj:`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object. 
+ """ + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + use_auth_token = kwargs.pop("use_auth_token", None) + local_files_only = kwargs.pop("local_files_only", False) + revision = kwargs.pop("revision", None) + # from_pipeline = kwargs.pop("_from_pipeline", None) + from_auto_class = kwargs.pop("_from_auto", False) + + user_agent = {"file_type": "config", "from_auto_class": from_auto_class} + # if from_pipeline is not None: + # user_agent["using_pipeline"] = from_pipeline + + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + config_file = pretrained_model_name_or_path + else: + configuration_file = get_configuration_file( + pretrained_model_name_or_path, + revision=revision, + use_auth_token=use_auth_token, + local_files_only=local_files_only, + ) + + + if os.path.isdir(pretrained_model_name_or_path): + config_file = os.path.join(pretrained_model_name_or_path, configuration_file) + else: + config_file = hf_bucket_url( + pretrained_model_name_or_path, filename=configuration_file, revision=revision, mirror=None + ) + + try: + # Load from URL or cache if already cached + resolved_config_file = cached_path( + config_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + user_agent=user_agent, + ) + # Load config dict + config_dict = cls._dict_from_json_file(resolved_config_file) + + except EnvironmentError as err: + logger.error(err) + msg = ( + f"Can't load config for '{pretrained_model_name_or_path}'. Make sure that:\n\n" + f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n" + f" (make sure '{pretrained_model_name_or_path}' is not a path to a local directory with something else, in that case)\n\n" + f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {CONFIG_NAME} file\n\n" + ) + + if revision is not None: + msg += f"- or '{revision}' is a valid git identifier (branch name, a tag name, or a commit id) that exists for this model name as listed on its model page on 'https://huggingface.co/models'\n\n" + + raise EnvironmentError(msg) + + except (json.JSONDecodeError, UnicodeDecodeError): + msg = ( + f"Couldn't reach server at '{config_file}' to download configuration file or " + "configuration file is not a valid JSON file. " + f"Please check network or file content here: {resolved_config_file}." 
+ ) + raise EnvironmentError(msg) + + if resolved_config_file == config_file: + logger.info(f"loading configuration file {config_file}") + else: + logger.info(f"loading configuration file {config_file} from cache at {resolved_config_file}") + + return config_dict, kwargs + + @classmethod + def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]): + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + return json.loads(text) + + def __repr__(self): + return f"{self.__class__.__name__} {self.to_json_string()}" + + def __eq__(self, other): + return self.__dict__ == other.__dict__ + + def to_json_string(self, use_diff: bool = True) -> str: + """[NODOC] + Serializes this instance to a JSON string. + Args: + use_diff (:obj:`bool`, *optional*, defaults to :obj:`True`): + If set to :obj:`True`, only the difference between the config instance and the default ``PretrainedConfig()`` + is serialized to JSON string. + Returns: + :obj:`str`: String containing all the attributes that make up this configuration instance in JSON format. + """ + if use_diff is True: + config_dict = self.to_diff_dict() + else: + config_dict = self.to_dict() + return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike], use_diff: bool = True): + """[NODOC] + Save this instance to a JSON file. + Args: + json_file_path (:obj:`str` or :obj:`os.PathLike`): + Path to the JSON file in which this configuration instance's parameters will be saved. + use_diff (:obj:`bool`, *optional*, defaults to :obj:`True`): + If set to :obj:`True`, only the difference between the config instance and the default ``PretrainedConfig()`` + is serialized to JSON file. + """ + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string(use_diff=use_diff)) + + def to_diff_dict(self) -> Dict[str, Any]: + """[NODOC] + Removes all attributes from config which correspond to the default config attributes for better readability and + serializes to a Python dictionary. + Returns: + :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = BaseDeltaConfig().to_dict() + + # get class specific config dict + class_config_dict = self.__class__().to_dict() #if not self.is_composition else {} + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if ( + key not in default_config_dict + or key in checked_package_versions + or value != default_config_dict[key] + or (key in class_config_dict and value != class_config_dict[key]) + ): + serializable_config_dict[key] = value + + self.dict_torch_dtype_to_str(serializable_config_dict) + + return serializable_config_dict + + def update(self, config_dict: Dict[str, Any]): + """[NODOC] + Updates attributes of this class with attributes from ``config_dict``. + Args: + config_dict (:obj:`Dict[str, Any]`): Dictionary of attributes that should be updated for this class. + """ + for key, value in config_dict.items(): + setattr(self, key, value) + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + Returns: + :obj:`dict`: Dictionary of all the attributes that make up this configuration instance. 
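+
+        Example (illustrative):
+
+        .. code-block:: python
+
+            config_dict = delta_config.to_dict()
+            print(delta_config.to_json_string())  # the JSON form also records the package versions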
+ """ + output = copy.deepcopy(self.__dict__) + if hasattr(self.__class__, "model_type"): + output["model_type"] = self.__class__.model_type + + # Transformers version when serializing the model + output["transformers_version"] = transformers_version + output["opendelta_version"] = opendelta_version + + self.dict_torch_dtype_to_str(output) + + return output + + def dict_torch_dtype_to_str(self, d: Dict[str, Any]) -> None: + """[NODOC] + Checks whether the passed dictionary has a *torch_dtype* key and if it's not None, converts torch.dtype to a + string of just the type. For example, ``torch.float32`` get converted into *"float32"* string, which can then be + stored in the json format. + """ + if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str): + d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1] + + + + +def get_configuration_file( + path_or_repo: Union[str, os.PathLike], + revision: Optional[str] = None, + use_auth_token: Optional[Union[bool, str]] = None, + local_files_only: bool = False, +) -> str: + """ + Get the configuration file to use for this version of transformers. + Args: + path_or_repo (`:obj:str` or `:obj:os.PathLike`): + Can be either the id of a repo on huggingface.co or a path to a *directory*. + revision(`:obj:str`, *optional*, defaults to ``"main"``): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any + identifier allowed by git. + use_auth_token (:obj:`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token generated + when running ``transformers-cli login`` (stored in ``~/.huggingface``). + local_files_only (:obj:`bool`, *optional*, defaults to :obj:`False`): + Whether or not to only rely on local files and not to attempt to download any files. + Returns: + :obj:`str`: The configuration file to use. + """ + # Inspect all files from the repo/folder. + all_files = get_list_of_files( + path_or_repo, revision=revision, use_auth_token=use_auth_token, local_files_only=local_files_only + ) + configuration_files_map = {} + for file_name in all_files: + search = _re_configuration_file.search(file_name) + if search is not None: + v = search.groups()[0] + configuration_files_map[v] = os.path.split(file_name)[-1] + available_versions = sorted(configuration_files_map.keys()) + # Defaults to FULL_CONFIGURATION_FILE and then try to look at some newer versions. + configuration_file = FULL_CONFIGURATION_FILE + # transformers_version_ = version.parse(transformers_version) + for v in available_versions: + # if version.parse(v) <= transformers_version_: + configuration_file = configuration_files_map[v] + # else: + # # No point going further since the versions are sorted. 
+ # break + + return configuration_file + + +if __name__ == "__main__": + myconfig = BaseDeltaConfig.from_pretrained("../ckpts/lora/") + myconfig.save_pretrained("../ckpts/lora.1/") + print(myconfig) \ No newline at end of file diff --git a/opendelta/delta_models/__init__.py b/opendelta/delta_models/__init__.py new file mode 100644 index 0000000..c57b864 --- /dev/null +++ b/opendelta/delta_models/__init__.py @@ -0,0 +1,2 @@ +from .lora import LoraModel, LoraConfig +from .bitfit import BitFitModel diff --git a/opendelta/delta_models/adapter.py b/opendelta/delta_models/adapter.py new file mode 100644 index 0000000..3017f01 --- /dev/null +++ b/opendelta/delta_models/adapter.py @@ -0,0 +1,202 @@ +from functools import partial +from random import random +from typing import Optional, Union +from opendelta.utils.signature import get_arg_names_inside_func +from opendelta.utils.name_based_addressing import * +from opendelta.utils.cuda import get_device +from opendelta.basemodel import DeltaBase +import loralib as lora +import torch.nn as nn +import torch +import math +from opendelta.delta_models.layers.activations import Activations +import inspect +from opendelta import BaseDeltaConfig +import opendelta.utils.logging as logging +logger = logging.get_logger(__name__) + +class AdapterLayer(nn.Module): + r"""A layer of adapter tuning module. + """ + layer_count = 0 + + @classmethod + def count_layer(cls): + cls.layer_count += 1 + + @classmethod + def get_layer_count(cls): + return cls.layer_count + + def __init__(self, bottleneck_dim=24, non_linearity='gelu_new', device=None): + super().__init__() + self.bottleneck_dim = bottleneck_dim + self.device = device + self.instantiated = False + self.non_linearity = non_linearity + + self.layer_id = AdapterLayer.get_layer_count() + AdapterLayer.count_layer() + + + def instantiate(self, hidden_dim): + self.modulelist = nn.Sequential() + self.modulelist.add_module("down_proj",nn.Linear(hidden_dim, self.bottleneck_dim, device=self.device)) + + # select non-linearity + self.modulelist.add_module("non_linear", Activations(self.non_linearity.lower())) + + self.modulelist.add_module("up_proj", nn.Linear(self.bottleneck_dim, self.hidden_dim, device=self.device)) + + # TODO: + # If we want to have a layer norm on output, we apply it later after a separate residual connection + # This means that we learn a new output layer norm, which replaces another layer norm learned in the bert layer + # if self.add_layer_norm_after: + # self.adapter_norm_after = nn.LayerNorm(self.input_size) + + self.instantiated = True + # initialize the weight, which is important for fast convergence and better performance. + self.apply(self._init_weight) + + def _init_weight(self, module): + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=0.01) + if module.bias is not None: + module.bias.data.zero_() + + + def post_forward(self, output): + r""" Get the hidden_states from the PLM's layer output, pass it into the adapter, + then combined with the main hidden_states. Finally pass it into the subsequent layer. 
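+
+        In short, the layer realizes a residual bottleneck (sketch):
+        ``hidden -> hidden + up_proj(non_linear(down_proj(hidden)))``.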
+ + """ + if isinstance(output, tuple): + hiddens = output[0] + elif isinstance(output, torch.Tensor): + hiddens = output + else: + raise TypeError + + + if not self.instantiated: + self.hidden_dim = hiddens.shape[-1] + logger.debug(f"Got hidden dim hidden_dim {self.hidden_dim}") + self.instantiate(hidden_dim=self.hidden_dim) + + + adapter_output = self.modulelist(hiddens) + modified_output = adapter_output + hiddens # TODO option: disable residual_connection + if isinstance(output, tuple): + output = (modified_output,) + output[1:] + elif isinstance(output, torch.Tensor): + output = modified_output + else: + raise TypeError + return output + + + +class AdapterConfig(BaseDeltaConfig): + r""" + This is the configuration class to store the configuration of a :py:class:`~AdapterModel` + + """ + def __init__( + self, + bottleneck_dim: Optional[int]=24, + non_linearity: Optional[str]='gelu_new', + sequential: Optional[str] = True, + **kwargs + ): + super().__init__(**kwargs) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # the arg has not been registered in parent config + setattr(self, arg_name, locals()[arg_name]) + + + +class AdapterModel(DeltaBase): + r""" The implementation of Adapter(`Parameter-Efficient Transfer Learning for NLP `_ ) . + Add adapter to the designated ``modified_modules``. In sequential paradigm, The modules' output is then passed into the adapter's + post_forward. + + .. note:: + We **assume** the output of the modified module is the hidden state or a tuple where hidden state is the + first element. This is true for most PLMs. However, we admit that currently it's not rigorous, We will improve + it in the next version. Currently, if you encount an error here for you backbone, you can modify the code to + get the hidden state. + + class attributes: + - default_modified_modules = ["attn", "ff"] According to the Adapter paper, we add adapter to the attention layer + and feed forward layer. + - delta_type = "adapter" + + Args: + backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. + bottleneck_dim (:obj:`int`): The dimension of the adapter's bottleneck. + non_linearity (:obj:`str`): The non linearity of the adapter. + sequential (:obj:`str`): Whether insert the adapter in a sequential manner, as opposed to a parallel manner. + See `Towards a Unified View of Parameter-Efficient Transfer Learning `_ + for detail. + modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only + the implemented ones) + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen + together with the prefix parameters. + common_structure (:obj:`bool`): whether using name-based addressing witha common structure mapping. 
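+
+    Example (a minimal usage sketch; the checkpoint name and the ``freeze_module`` call are
+    illustrative, assuming the usual :py:class:`DeltaBase` helpers)::
+
+        >>> from transformers import AutoModelForSeq2SeqLM
+        >>> backbone = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+        >>> delta_model = AdapterModel(backbone_model=backbone, bottleneck_dim=24)
+        >>> # keep only the inserted adapters trainable
+        >>> delta_model.freeze_module(exclude=["deltas"])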
+ + """ + config_class = AdapterConfig + delta_type = "adapter" + default_modified_modules = ["attn", "ff"] + def __init__(self, + backbone_model: nn.Module, + bottleneck_dim: Optional[int]=24, + non_linearity: Optional[str]='gelu_new', + sequential: Optional[str] = True, + modified_modules: Optional[bool] = None, + unfrozen_modules: Optional[bool] = None, + common_structure: Optional[bool] = None, + interactive_modify: Optional[Union[bool, int]] = False, + ): + DeltaBase.__init__(self, + backbone_model, + modified_modules=modified_modules, + unfrozen_modules=unfrozen_modules, + common_structure=common_structure, + interactive_modify=interactive_modify, + ) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # not registered in parent class + setattr(self, arg_name, locals()[arg_name]) + + self.delta_modules = nn.ModuleList() + + self.add_all_delta_to_backbone(self.backbone_model, + self.modified_modules, + ) + + + def add_all_delta_to_backbone(self, + module: nn.Module, + modified_modules: List[str], + ) -> nn.Module: + for key, _ in module.named_modules(): + if self.find_key(key, modified_modules): + self.update_module(module, key) + self._pseudo_data_to_instantiate(module) + self.mark_as_delta() + return module + + def update_module(self, module: nn.Module, key: str): + _, _, ref = self.find_module(module, key) + adapterlayer = self.new_module_like(ref) + self.insert_sequential_module(ref, delta_module=adapterlayer, name="adapter") + + def new_module_like(self, module): + module_device = get_device(module) + adapterlayer = AdapterLayer(bottleneck_dim=self.bottleneck_dim, non_linearity=self.non_linearity, device=module_device) + self.delta_modules.append(adapterlayer) + return adapterlayer + \ No newline at end of file diff --git a/opendelta/delta_models/bitfit.py b/opendelta/delta_models/bitfit.py new file mode 100644 index 0000000..9bdff02 --- /dev/null +++ b/opendelta/delta_models/bitfit.py @@ -0,0 +1,202 @@ +from typing import Optional, Union +from opendelta.utils.signature import get_arg_names_inside_func +from opendelta.utils.name_based_addressing import * +from opendelta.basemodel import DeltaBase, is_leaf_module +from transformers.models.t5 import T5ForConditionalGeneration +import loralib as lora +import torch.nn as nn + +from transformers.models.bert.modeling_bert import BertForMaskedLM +import torch +from torch.nn import init +import math +from opendelta.utils.structure_mapping import transform +from opendelta import BaseDeltaConfig +import opendelta.utils.logging as logging +logger = logging.get_logger(__name__) + + +class BitFitConfig(BaseDeltaConfig): + r""" + This is the configuration class to store the configuration of a :py:class:`~BitFitModel` + + """ + def __init__( + self, + **kwargs + ): + super().__init__(**kwargs) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # the arg has not been registered in parent config + setattr(self, arg_name, locals()[arg_name]) + +class BiasLayer(nn.Module): + def __init__(self, init_method="zero"): + super().__init__() + self.init_method=init_method + self.instantiated = False + + def instantiate(self, hidden_dim): + if self.init_method == "zero": + self.bias = nn.Parameter(torch.zeros(hidden_dim)) + else: + raise NotImplementedError + self.instantiated = True + + def post_forward(self, output): + r"""Presuming the first argument is the tensor to add bias along the last dimension. 
+ In most cases, it is correct. However, be aware of the possibility that the presumption + doesn't hold. + """ + if isinstance(output, tuple): + hiddens = output[0] + elif isinstance(output, torch.Tensor): + hiddens = output + else: + raise TypeError + + if not self.instantiated: + self.hidden_dim = hiddens.shape[-1] + logger.debug(f"Got hidden dim hidden_dim {self.hidden_dim}") + self.instantiate(hidden_dim=self.hidden_dim) + + modified_output = hiddens + self.bias + + if isinstance(output, tuple): + output = (modified_output,) + output[1:] + elif isinstance(output, torch.Tensor): + output = modified_output + else: + raise TypeError + return output + + + +class BitFitModel(DeltaBase): + r""" The implementation of `BitFit: Simple Parameter-efficient Fine-tuning for Transformer-based Masked Language-models `_ . + Unfreeze bias term (or add bias term if bias term is absent in the backbone, e.g. T5) to the modules of + a transformer block. + + .. note:: + + **Broadcast to Submodule**: We modify all potential positions of the specified + ``modified_modules``. That is to say, if we specify ``attn`` in the modified_modules, then all position + including the q, k, v and out linear layer of the attention layer are added bias layer (or unfreezing). + The potential position is determined according to equation (1)-(5) and the previous three + equations. + + + class attributes: + - default_modified_modules = ["attn", "ff", "layer_norm","lm_head.proj"] According to the paper and the + implementation in `Compacter's baseline `_ , we modify the + bias term in the above modules. + - delta_type = "bitfit" + + + + + Args: + backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. + modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only + the implemented ones) + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen + together with the prefix parameters. + common_structure (:obj:`bool`): whether using name-based addressing witha common structure mapping. + + """ + + + config_class = BitFitConfig + delta_type = "bitfit" + default_modified_modules = ["attn", "ff", "layer_norm","lm_head.proj"] # modify all the bias parameter in attention and feed-forward layer. + def __init__(self, + backbone_model: nn.Module, + modified_modules: Optional[bool] = None, + unfrozen_modules: Optional[bool] = None, + common_structure: Optional[bool] = None, + interactive_modify: Optional[Union[bool, int]] = False, + ): + DeltaBase.__init__(self, + backbone_model, + modified_modules=modified_modules, + unfrozen_modules=unfrozen_modules, + common_structure=common_structure, + interactive_modify=interactive_modify, + ) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # not registered in parent class + setattr(self, arg_name, locals()[arg_name]) + + self.delta_params = nn.ParameterList() + self.delta_modules = nn.ModuleList() + + self.add_all_delta_to_backbone(self.backbone_model, + self.modified_modules, + ) + + + def update_module(self, module: nn.Module, key: str): + _, _, ref = self.find_module(module, key) + self.modify_module(ref) + + + def modify_module(self, + module: nn.Module, + ): + if is_leaf_module(module): + # if it is a leaf module, add bias to it regardless of its type. + if isinstance(module, nn.Linear): + self.add_bias_to_linear(module) + else: + # for example, layer_norms, lm_heads. 
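+                    # Non-linear leaf modules (e.g. LayerNorm) get a BiasLayer hooked after their
+                    # forward pass instead of a new `bias` Parameter registered on the module itself.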
+ self.add_bias_to_others(module) + else: + # for the non-leaf modules, by default it will add bias only to the linear submodules. + for n, c in module.named_modules(): + if isinstance(c, nn.Linear): + if c.bias is None: + bias = nn.Parameter(torch.empty(c.out_features), requires_grad=True) + c.register_parameter('bias', bias) + self._reset_bias_parameters(c) + self.delta_params.append(bias) + else: + c.bias.requires_grad = True + self.delta_params.append(c.bias) + else: + pass + + def add_bias_to_linear(self, c): + if c.bias is None: + bias = nn.Parameter(torch.empty(c.out_features), requires_grad=True) + c.register_parameter('bias', bias) + self._reset_bias_parameters(c) + self.delta_params.append(bias) + else: + c.bias.requires_grad = True + self.delta_params.append(c.bias) + + def add_bias_to_others(self, c): + new_bias = BiasLayer() + self.insert_sequential_module(c, delta_module=new_bias, name="bitfit") # name shouldn't be `bias` here, since + # the name `bias` is reserved for some module such as roberta's LayerNorm. + self.delta_modules.append(new_bias) + + + + @staticmethod + def _reset_bias_parameters(linear_module): + fan_in, _ = init._calculate_fan_in_and_fan_out(linear_module.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + init.uniform_(linear_module.bias, -bound, bound) + + def detach(self, module): + r"""Not implemented for BitFit yet. Please wait for the next version. + """ + raise NotImplementedError + + def attach(self, module): + r"""Not implemented for BitFit yet. Please wait for the next version. + """ + raise NotImplementedError diff --git a/opendelta/delta_models/compacter.py b/opendelta/delta_models/compacter.py new file mode 100644 index 0000000..72c287a --- /dev/null +++ b/opendelta/delta_models/compacter.py @@ -0,0 +1,303 @@ +from functools import partial +from typing import Optional, Union +from opendelta.delta_configs import BaseDeltaConfig +from opendelta.utils.signature import get_arg_names_inside_func +from opendelta.utils.name_based_addressing import * +from opendelta.utils.cuda import get_device +from opendelta.basemodel import DeltaBase +import loralib as lora +import torch.nn as nn +import torch +import math +import opendelta +from opendelta.delta_models.layers.activations import Activations +import inspect +from opendelta.delta_models.layers.hypercomplex_linear import PHMLinear +import opendelta.utils.logging as logging +logger = logging.get_logger(__name__) + +class HyperComplexAdapterLayer(nn.Module): + """Hypercomplex Adapter layer, in which the weights of up and down sampler modules + are parameters are 1/n times of the conventional adapter layers, where n is + hypercomplex division number.""" + + def __init__(self, + reduction_factor=16, + non_linearity="relu", + phm_c_init="normal", + hypercomplex_division=4, + learn_phm=True, + hypercomplex_nonlinearity="glorot-uniform", + shared_phm_rule=False, + factorized_phm=True, + phm_rule: Optional[torch.Tensor]=None, + shared_W_phm=False, + factorized_phm_rule=False, + phm_rank=1, + phm_init_range=0.0001, + kronecker_prod=None, + device=None, + use_bias_up_sampler=True, + use_bias_down_sampler=True, + ): + super().__init__() + self.reduction_factor = reduction_factor + self.non_linearity = non_linearity + self.phm_c_init = phm_c_init + self.hypercomplex_division = hypercomplex_division + self.learn_phm = learn_phm + self.phm_rule=phm_rule + self.hypercomplex_nonlinearity = hypercomplex_nonlinearity + self.shared_phm_rule = shared_phm_rule + self.factorized_phm = factorized_phm + 
self.shared_W_phm = shared_W_phm + self.factorized_phm_rule = factorized_phm_rule + self.phm_rank = phm_rank + self.phm_init_range = phm_init_range + self.kronecker_prod = kronecker_prod + self.use_bias_up_sampler=use_bias_up_sampler + self.use_bias_down_sampler=use_bias_down_sampler + self.device = device + + self.instantiated = False + + + def instantiate(self, hidden_dim): + self.down_sample_size = hidden_dim // self.reduction_factor + self.activation = Activations(self.non_linearity.lower()).to(self.device) + self.down_sampler = PHMLinear(in_features=hidden_dim, + out_features=self.down_sample_size, + bias=self.use_bias_down_sampler, + c_init=self.phm_c_init, + phm_dim=self.hypercomplex_division, + phm_rule=self.phm_rule, + learn_phm=self.learn_phm, + w_init=self.hypercomplex_nonlinearity, + shared_phm_rule=self.shared_phm_rule, + factorized_phm=self.factorized_phm, + shared_W_phm=self.shared_W_phm, + factorized_phm_rule=self.factorized_phm_rule, + phm_rank=self.phm_rank, + phm_init_range=self.phm_init_range, + kronecker_prod=self.kronecker_prod).to(self.device) + self.up_sampler = PHMLinear(in_features=self.down_sample_size, + out_features=hidden_dim, + bias=self.use_bias_up_sampler, + c_init=self.phm_c_init, + phm_dim=self.hypercomplex_division, + phm_rule=self.phm_rule, + learn_phm=self.learn_phm, + w_init=self.hypercomplex_nonlinearity, + shared_phm_rule=self.shared_phm_rule, + factorized_phm=self.factorized_phm, + shared_W_phm=self.shared_W_phm, + factorized_phm_rule=self.factorized_phm_rule, + phm_rank=self.phm_rank, + phm_init_range=self.phm_init_range, + kronecker_prod=self.kronecker_prod).to(self.device) + self.instantiated = True + + + def post_forward(self, output): + r""" Get the hidden_states from the PLM's layer output, pass it into the hypercomplex adapter, + then combined with the main hidden_states. Finally pass it into the subsequent layer. 
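+
+        The down/up samplers here are :py:class:`PHMLinear` layers, so the residual bottleneck
+        ``hidden -> hidden + up_sampler(activation(down_sampler(hidden)))`` uses
+        parameterized-hypercomplex-multiplication projections instead of dense linear layers.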
+ + """ + + if isinstance(output, tuple): + hiddens = output[0] + elif isinstance(output, torch.Tensor): + hiddens = output + else: + raise TypeError + + if not self.instantiated: + self.hidden_dim = hiddens.shape[-1] + logger.debug(f"Got hidden dim hidden_dim {self.hidden_dim}") + self.instantiate(hidden_dim=self.hidden_dim) + + + z = self.down_sampler(hiddens) + z = self.activation(z) + adapter_output = self.up_sampler(z) + + modified_output = adapter_output + hiddens # residual_connection + if isinstance(output, tuple): + output = (modified_output,) + output[1:] + elif isinstance(output, torch.Tensor): + output = modified_output + else: + raise TypeError + return output + +class CompacterConfig(BaseDeltaConfig): + r""" + This is the configuration class to store the configuration of a :py:class:`~CompacterModel` + + """ + def __init__( + self, + bottleneck_dim: Optional[int]=32, + non_linearity: Optional[str]='relu', + sequential: Optional[str] = True, + reduction_factor=16, + phm_c_init="normal", + hypercomplex_division=4, + learn_phm=True, + hypercomplex_nonlinearity="glorot-uniform", + shared_phm_rule=False, + factorized_phm=True, + shared_W_phm=False, + factorized_phm_rule=False, + phm_rank=1, + phm_init_range=0.0001, + kronecker_prod=None, + use_bias_up_sampler=True, + use_bias_down_sampler=True, + **kwargs + ): + super().__init__(**kwargs) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # the arg has not been registered in parent config + setattr(self, arg_name, locals()[arg_name]) + + + +class CompacterModel(DeltaBase): + r""" The implementation of `Compacter: Efficient Low-Rank Hypercomplex Adapter Layers `_ . + Add compacter layer to the designated ``modified_modules``. In sequential paradigm, The modules' output is then + passed into the compacter's post_forward. + + .. note:: + We **assume** the output of the modified module is the hidden state or a tuple where hidden state is the + first element. This is true for most PLMs. However, we admit that currently it's not rigorous, We will improve + it in the next version. Currently, if you encount an error here for you backbone, you can modify the code to + get the hidden state. + + All the hyperparameter is adopted from the `compacter code base `_ . + + class attributes: + - default_modified_modules = ["attn", "ff"] According to the compacter paper, we add compacter to the attention layer + and feed forward layer. + - delta_type = "compacter" + + Args: + backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. + modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only + the implemented ones) + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen + together with the prefix parameters. + common_structure (:obj:`bool`, *optional*, default to :obj:`None`): whether using name-based addressing witha common structure mapping. + reduction_factor (:obj:`int`, *optional*, default to ``16``): bottleneck_dim = hidden_dim//reduction_factor + non_linearity (:obj:`str`, *optional*, default to ``"gelu_new"``): The non linearity activation used in between the down + projecter and the up projecter. + phm_c_init (:obj:`str`, *optional*, default to ``"normal"``): The initialize method of the C in compacter. + hypercomplex_division (:obj:`str`, *optional*, default to 4): The ``n`` in the paper. 
The number of division along a dimension in compector. + learn_phm (:obj:`bool`, *optional*, default to :obj:`True` ): Whether the phm rule requires_grad. Note that we didn't check the performance of learn_phm=False. + hypercomplex_nonlinearity (:obj:`str`, *optional*, default to ``"glorot-uniform"``): The initialize method of the W in compacter. + shared_phm_rule (:obj:`str`, *optional* , default to :obj:`False`): Whether the phm rule is shared accross layer. + factorized_phm (:obj:`str`, *optional*, default to :obj:`True`): Whether to factorize the phm into low rank product. + shared_W_phm (:obj:`str`, *optional* , default to :obj:`False`): Whether the W_phm is shared accross layer. + factorized_phm_rule (:obj:`str`, *optional* , default to :obj:`False`): Whether to factorize the phm rule into low rank product. + phm_rank=1 (:obj:`int`, *optional*, default to 1): The rank of low rank decomposition of phm. + phm_init_range (:obj:`float`, *optional*, default to 0.0001): The range of phm initialization. + kronecker_prod (:obj:`bool`, *optional*, default to False): Whether to perform kronecker_prod in matvec_product, proposed by + `Parameterization of Hypercomplex Multiplications `_ + use_bias_up_sampler (:obj:`float`, *optional*, default to :obj:`True`): Whether add bias to the up projector. + Note that the bias for this is a ``hidden_dim`` vector. + use_bias_down_sampler (:obj:`float`, *optional*, default to :obj:`True`): Whether add bias to the down projector. + Note that the bias for this is a ``bottleneck_dim`` vector. + + + """ + config_class = CompacterConfig + delta_type = "compacter" + default_modified_modules = ["attn", "ff"] + def __init__(self, + backbone_model, + modified_modules: Optional[bool] = None, + unfrozen_modules: Optional[bool] = None, + common_structure: Optional[bool] = None, + interactive_modify: Optional[Union[bool, int]] = False, + reduction_factor=16, + non_linearity="gelu_new", + phm_c_init="normal", + hypercomplex_division=4, + learn_phm=True, + hypercomplex_nonlinearity="glorot-uniform", + shared_phm_rule=False, + factorized_phm=True, + shared_W_phm=False, + factorized_phm_rule=False, + phm_rank=1, + phm_init_range=0.0001, + kronecker_prod=None, + use_bias_up_sampler=True, + use_bias_down_sampler=True, + ): + DeltaBase.__init__(self, + backbone_model, + modified_modules=modified_modules, + unfrozen_modules=unfrozen_modules, + common_structure=common_structure, + interactive_modify=interactive_modify, + ) + assert shared_phm_rule == False, "In opendelta version {opendelta.__version__}, "\ + "shared_phm_rule is not supported. Later, sharing parameters will be tackled using"\ + "a unified paradigm." + assert shared_W_phm == False, "In opendelta version {opendelta.__version__}, "\ + "shared_W_phm is not supported. Later, sharing parameters will be tackled using"\ + "a unified paradigm." 
+ arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # not registered in parent class + setattr(self, arg_name, locals()[arg_name]) + + self.delta_modules = nn.ModuleList() + + self.add_all_delta_to_backbone(self.backbone_model, + self.modified_modules, + ) + + + def add_all_delta_to_backbone(self, + module: nn.Module, + modified_modules: List[str], + ) -> nn.Module: + for key, _ in module.named_modules(): + if self.find_key(key, modified_modules): + self.update_module(module, key) + self._pseudo_data_to_instantiate(module) + self.mark_as_delta() + return module + + def update_module(self, module: nn.Module, key: str): + _, _, ref = self.find_module(module, key) + adapterlayer = self.new_module_like(ref) + self.insert_sequential_module(ref, + delta_module=adapterlayer, + name="compactor") + + def new_module_like(self, module): + module_device = get_device(module) + adapterlayer = HyperComplexAdapterLayer(reduction_factor=self.reduction_factor, + non_linearity=self.non_linearity, + phm_c_init=self.phm_c_init, + hypercomplex_division=self.hypercomplex_division, + learn_phm=self.learn_phm, + hypercomplex_nonlinearity=self.hypercomplex_nonlinearity, + shared_phm_rule=self.shared_phm_rule, + factorized_phm=self.factorized_phm, + shared_W_phm=self.shared_W_phm, + factorized_phm_rule=self.factorized_phm_rule, + phm_rank=self.phm_rank, + phm_init_range=self.phm_init_range, + kronecker_prod=self.kronecker_prod, + use_bias_up_sampler=self.use_bias_up_sampler, + use_bias_down_sampler=self.use_bias_down_sampler, + device=module_device + ) + self.delta_modules.append(adapterlayer) + return adapterlayer + \ No newline at end of file diff --git a/opendelta/delta_models/layers/__init__.py b/opendelta/delta_models/layers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/opendelta/delta_models/layers/activations.py b/opendelta/delta_models/layers/activations.py new file mode 100644 index 0000000..8ce4a16 --- /dev/null +++ b/opendelta/delta_models/layers/activations.py @@ -0,0 +1,50 @@ +import torch +import math +import torch.nn as nn + +import torch.nn as nn +from transformers.activations import get_activation + +class Activations(nn.Module): + """ + Implementation of various activation function. Copied from open-source project AdapterHub #TODO: addlink + """ + + def __init__(self, activation_type): + self.activation_type = activation_type + if activation_type.lower() == "relu": + self.f = nn.functional.relu + elif activation_type.lower() == "tanh": + self.f = torch.tanh + elif activation_type.lower() == "swish": + + def swish(x): + return x * torch.sigmoid(x) + + self.f = swish + elif activation_type.lower() == "gelu_new": + + def gelu_new(x): + """ + Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). 
+ Also see https://arxiv.org/abs/1606.08415 + """ + return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + + self.f = gelu_new + elif activation_type.lower() == "gelu_orig": + self.f = nn.functional.gelu + elif activation_type.lower() == "leakyrelu": + self.f = nn.functional.leaky_relu + else: + self.f = get_activation(activation_type) + + super().__init__() + + def forward(self, x): + return self.f(x) + + def __repr__(self): + return self.activation_type + + diff --git a/opendelta/delta_models/layers/hypercomplex_linear.py b/opendelta/delta_models/layers/hypercomplex_linear.py new file mode 100644 index 0000000..e0ed589 --- /dev/null +++ b/opendelta/delta_models/layers/hypercomplex_linear.py @@ -0,0 +1,213 @@ +# The codes are from https://github.com/bayer-science-for-a-better-life/phc-gnn +import torch +import torch.nn as nn +from typing import Union, Optional +import torch.nn.functional as F +import torch +import math +from opendelta.delta_models.layers.init import glorot_uniform, glorot_normal + + + +# The codes are from https://github.com/bayer-science-for-a-better-life/phc-gnn + +"""A part of the pylabyk library: numpytorch.py at https://github.com/yulkang/pylabyk""" +def kronecker_product(a, b): + """ + Kronecker product of matrices a and b with leading batch dimensions. + Batch dimensions are broadcast. The number of them mush + :type a: torch.Tensor + :type b: torch.Tensor + :rtype: torch.Tensor + """ + #return torch.stack([torch.kron(ai, bi) for ai, bi in zip(a,b)], dim=0) + siz1 = torch.Size(torch.tensor(a.shape[-2:]) * torch.tensor(b.shape[-2:])) + res = a.unsqueeze(-1).unsqueeze(-3) * b.unsqueeze(-2).unsqueeze(-4) + siz0 = res.shape[:-4] + out = res.reshape(siz0 + siz1) + return out + + +def kronecker_product_einsum_batched(A: torch.Tensor, B: torch.Tensor): + """ + Batched Version of Kronecker Products + :param A: has shape (b, a, c) + :param B: has shape (b, k, p) + :return: (b, ak, cp) + """ + assert A.dim() == 3 and B.dim() == 3 + res = torch.einsum('bac,bkp->bakcp', A, B).view(A.size(0), + A.size(1)*B.size(1), + A.size(2)*B.size(2)) + return res + + + +def matvec_product(W: torch.Tensor, x: torch.Tensor, + bias: Optional[torch.Tensor], + phm_rule, #: Union[torch.Tensor], + kronecker_prod=False) -> torch.Tensor: + """ + Functional method to compute the generalized matrix-vector product based on the paper + "Parameterization of Hypercomplex Multiplications (2020)" + https://openreview.net/forum?id=rcQdycl0zyk + y = Hx + b , where W is generated through the sum of kronecker products from the Parameterlist W, i.e. 
+ W is a an order-3 tensor of size (phm_dim, in_features, out_features) + x has shape (batch_size, phm_dim*in_features) + phm_rule is an order-3 tensor of shape (phm_dim, phm_dim, phm_dim) + H = sum_{i=0}^{d} mul_rule \otimes W[i], where \otimes is the kronecker product + """ + if kronecker_prod: + H = kronecker_product(phm_rule, W).sum(0) + else: + H = kronecker_product_einsum_batched(phm_rule, W).sum(0) + + y = torch.matmul(input=x, other=H) + if bias is not None: + y += bias + return y + + +class PHMLinear(torch.nn.Module): + def __init__(self, + in_features: int, + out_features: int, + phm_dim: int, + phm_rule: Union[None, torch.Tensor] = None, + bias: bool = True, + w_init: str = "phm", + c_init: str = "random", + learn_phm: bool = True, + shared_phm_rule=False, + factorized_phm=False, + shared_W_phm=False, + factorized_phm_rule=False, + phm_rank = 1, + phm_init_range=0.0001, + kronecker_prod=False) -> None: + super(PHMLinear, self).__init__() + assert w_init in ["phm", "glorot-normal", "glorot-uniform", "normal"] + assert c_init in ["normal", "uniform"] + assert in_features % phm_dim == 0, f"Argument `in_features`={in_features} is not divisble be `phm_dim`{phm_dim}" + assert out_features % phm_dim == 0, f"Argument `out_features`={out_features} is not divisble be `phm_dim`{phm_dim}" + self.in_features = in_features + self.out_features = out_features + self.learn_phm = learn_phm + self.phm_dim = phm_dim + self._in_feats_per_axis = in_features // phm_dim + self._out_feats_per_axis = out_features // phm_dim + self.phm_rank = phm_rank + self.phm_rule = phm_rule + self.phm_init_range = phm_init_range + self.kronecker_prod=kronecker_prod + self.shared_phm_rule = shared_phm_rule + self.factorized_phm_rule = factorized_phm_rule + if not self.shared_phm_rule: + if self.factorized_phm_rule: + self.phm_rule_left = nn.Parameter(torch.FloatTensor(phm_dim, phm_dim, 1), + requires_grad=learn_phm) + self.phm_rule_right = nn.Parameter(torch.FloatTensor(phm_dim, 1, phm_dim), + requires_grad=learn_phm) + else: + self.phm_rule = nn.Parameter(torch.FloatTensor(phm_dim, phm_dim, phm_dim), + requires_grad=learn_phm) + self.bias_flag = bias + self.w_init = w_init + self.c_init = c_init + self.shared_W_phm = shared_W_phm + self.factorized_phm = factorized_phm + if not self.shared_W_phm: + if self.factorized_phm: + self.W_left = nn.Parameter(torch.Tensor(size=(phm_dim, self._in_feats_per_axis, self.phm_rank)), + requires_grad=True) + self.W_right = nn.Parameter(torch.Tensor(size=(phm_dim, self.phm_rank, self._out_feats_per_axis)), + requires_grad=True) + else: + self.W = nn.Parameter(torch.Tensor(size=(phm_dim, self._in_feats_per_axis, self._out_feats_per_axis)), + requires_grad=True) + if self.bias_flag: + self.b = nn.Parameter(torch.Tensor(out_features)) + else: + self.register_parameter("b", None) + self.reset_parameters() + + def init_W(self): + if self.w_init == "glorot-normal": + if self.factorized_phm: + for i in range(self.phm_dim): + self.W_left.data[i] = glorot_normal(self.W_left.data[i]) + self.W_right.data[i] = glorot_normal(self.W_right.data[i]) + else: + for i in range(self.phm_dim): + self.W.data[i] = glorot_normal(self.W.data[i]) + elif self.w_init == "glorot-uniform": + if self.factorized_phm: + for i in range(self.phm_dim): + self.W_left.data[i] = glorot_uniform(self.W_left.data[i]) + self.W_right.data[i] = glorot_uniform(self.W_right.data[i]) + else: + for i in range(self.phm_dim): + self.W.data[i] = glorot_uniform(self.W.data[i]) + elif self.w_init == "normal": + if self.factorized_phm: + 
for i in range(self.phm_dim): + self.W_left.data[i].normal_(mean=0, std=self.phm_init_range) + self.W_right.data[i].normal_(mean=0, std=self.phm_init_range) + else: + for i in range(self.phm_dim): + self.W.data[i].normal_(mean=0, std=self.phm_init_range) + else: + raise ValueError + + def reset_parameters(self): + if not self.shared_W_phm: + self.init_W() + + if self.bias_flag: + self.b.data = torch.zeros_like(self.b.data) + + if not self.shared_phm_rule: + if self.factorized_phm_rule: + if self.c_init == "uniform": + self.phm_rule_left.data.uniform_(-0.01, 0.01) + self.phm_rule_right.data.uniform_(-0.01, 0.01) + elif self.c_init == "normal": + self.phm_rule_left.data.normal_(std=0.01) + self.phm_rule_right.data.normal_(std=0.01) + else: + raise NotImplementedError + else: + if self.c_init == "uniform": + self.phm_rule.data.uniform_(-0.01, 0.01) + elif self.c_init == "normal": + self.phm_rule.data.normal_(mean=0, std=0.01) + else: + raise NotImplementedError + + def set_phm_rule(self, phm_rule=None, phm_rule_left=None, phm_rule_right=None): + """If factorized_phm_rules is set, phm_rule is a tuple, showing the left and right + phm rules, and if this is not set, this is showing the phm_rule.""" + if self.factorized_phm_rule: + self.phm_rule_left = phm_rule_left + self.phm_rule_right = phm_rule_right + else: + self.phm_rule = phm_rule + + def set_W(self, W=None, W_left=None, W_right=None): + if self.factorized_phm: + self.W_left = W_left + self.W_right = W_right + else: + self.W = W + + def forward(self, x: torch.Tensor, phm_rule: Union[None, nn.ParameterList] = None) -> torch.Tensor: + if self.factorized_phm: + W = torch.bmm(self.W_left, self.W_right) + if self.factorized_phm_rule: + phm_rule = torch.bmm(self.phm_rule_left, self.phm_rule_right) + return matvec_product( + W=W if self.factorized_phm else self.W, + x=x, + bias=self.b, + phm_rule=phm_rule if self.factorized_phm_rule else self.phm_rule, + kronecker_prod=self.kronecker_prod) \ No newline at end of file diff --git a/opendelta/delta_models/layers/init.py b/opendelta/delta_models/layers/init.py new file mode 100644 index 0000000..98a03e3 --- /dev/null +++ b/opendelta/delta_models/layers/init.py @@ -0,0 +1,8 @@ +import torch +import math + +def glorot_normal(tensor: torch.Tensor): + return torch.nn.init.xavier_normal_(tensor, gain=math.sqrt(2)) + +def glorot_uniform(tensor: torch.Tensor): + return torch.nn.init.xavier_uniform_(tensor, gain=math.sqrt(2)) diff --git a/opendelta/delta_models/layers/low_rank_linear.py b/opendelta/delta_models/layers/low_rank_linear.py new file mode 100644 index 0000000..61ab92d --- /dev/null +++ b/opendelta/delta_models/layers/low_rank_linear.py @@ -0,0 +1,39 @@ +"""This script implements a low-rank linear layer.""" +import torch +import torch.nn as nn + +from opendelta.delta_models.layers.init import glorot_uniform, glorot_normal + +class LowRankLinear(torch.nn.Module): + def __init__(self, input_dim: int, output_dim: int, rank: int = 1, + bias: bool = True, w_init: str = "glorot-uniform"): + super(LowRankLinear, self).__init__() + self.input_dim = input_dim + self.output_dim = output_dim + self.rank = rank + self.bias = bias + self.w_init = w_init + self.W_left = nn.Parameter(torch.Tensor(size=(input_dim, rank)), requires_grad=True) + self.W_right = nn.Parameter(torch.Tensor(size=(rank, output_dim)), requires_grad=True) + if bias: + self.b = nn.Parameter(torch.Tensor(output_dim)) + self.reset_parameters() + + def reset_parameters(self): + if self.bias: + self.b.data = torch.zeros_like(self.b.data) + if 
self.w_init == "glorot-uniform": + self.W_left.data = glorot_uniform(self.W_left.data) + self.W_right.data = glorot_uniform(self.W_right.data) + elif self.w_init == "glorot-normal": + self.W_left.data = glorot_normal(self.W_left.data) + self.W_right.data = glorot_normal(self.W_right.data) + else: + raise ValueError + + def forward(self, x: torch.Tensor) -> torch.Tensor: + W = self.W_left*self.W_right + output = torch.matmul(input=x, other=W) + if self.bias: + output += self.b + return output diff --git a/opendelta/delta_models/lora.py b/opendelta/delta_models/lora.py new file mode 100644 index 0000000..05af87e --- /dev/null +++ b/opendelta/delta_models/lora.py @@ -0,0 +1,127 @@ +from typing import Optional, Union + +from opendelta.utils.signature import get_arg_names, get_arg_names_inside_func +from opendelta.utils.name_based_addressing import * +from opendelta.basemodel import DeltaBase +from transformers.models.t5 import T5ForConditionalGeneration +import loralib as lora +import torch.nn as nn +from opendelta import BaseDeltaConfig + +class LoraConfig(BaseDeltaConfig): + r""" + This is the configuration class to store the configuration of a :py:class:`~LoraModel` + + """ + def __init__( + self, + lora_r=8, + lora_alpha=16, + lora_dropout=0.0, + **kwargs + ): + super().__init__(**kwargs) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # the arg has not been registered in parent config + setattr(self, arg_name, locals()[arg_name]) + + + +class LoraModel(DeltaBase): + r""" The implementation of `LoRA: Low-Rank Adaptation of Large Language Models `_ . + Thanks for their `loralib `_, we use loralib.linear + to replace the linear layer of the backbone model. + + class attributes: + - default_modified_modules = ['attn.q', 'attn.v'] According to the paper, they modify q and v matrix in the + attention layer. However, other linears can also be modified, and may lead to better performance. + + .. note:: + modified_modules should point to linear layer. We currently don't support broadcast to all linears in + a module's child modules. + + - delta_type = "lora" + + + Args: + backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. + lora_r (:obj:`int`, *optional*): the rank of the lora parameters. The smaller lora_r is , the fewer parameters lora has. + lora_alpha (:obj:`bool`, *optional*): A hyper-parameter to control the init scale of loralib.linear . + lora_dropout (:obj:`bool`, *optional*): The dropout rate in lora.linear. + modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only + the implemented ones) + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen + together with the prefix parameters. + common_structure (:obj:`bool`): whether using name-based addressing witha common structure mapping. 
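+
+    Example (a minimal usage sketch; the checkpoint name and the ``freeze_module`` call are
+    illustrative, assuming the usual :py:class:`DeltaBase` helpers)::
+
+        >>> from transformers import AutoModelForSequenceClassification
+        >>> backbone = AutoModelForSequenceClassification.from_pretrained("roberta-base")
+        >>> delta_model = LoraModel(backbone_model=backbone, lora_r=8, lora_alpha=16)
+        >>> # freeze the backbone so that only the lora_A / lora_B matrices carry gradients
+        >>> delta_model.freeze_module(exclude=["deltas"])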
+ + """ + + config_class = LoraConfig + delta_type = "lora" + default_modified_modules = ['attn.q', 'attn.v'] + def __init__(self, + backbone_model: nn.Module, + lora_r=8, + lora_alpha=16, + lora_dropout=0.0, + modified_modules: Optional[bool] = None, + unfrozen_modules: Optional[bool] = None, + common_structure: Optional[bool] = None, + interactive_modify: Optional[Union[bool, int]] = False, + ): + DeltaBase.__init__(self, + backbone_model, + modified_modules=modified_modules, + unfrozen_modules=unfrozen_modules, + common_structure=common_structure, + interactive_modify=interactive_modify, + ) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # not registered in parent class + setattr(self, arg_name, locals()[arg_name]) + + self.delta_modules = nn.ModuleList() + + self.add_all_delta_to_backbone(self.backbone_model, + self.modified_modules, + ) + + + + def update_module(self, module: nn.Module, key: str): + parent_ref, child_name, child_ref = self.find_module(module, key) + new_module = self.new_module_like(child_module=child_ref) + self.replace_module(parent_ref, child_name, child_ref, new_module, delta_name="lora") + + def _pseudo_data_to_instantiate(self, module): + # no need to pass pseudo input, so overwrite it + pass + + def new_module_like(self, child_module): + if isinstance(child_module, nn.Linear): + in_features, out_features = child_module.in_features, child_module.out_features + new_module = lora.Linear(in_features=in_features, + out_features=out_features, + r=self.lora_r, + lora_alpha=self.lora_alpha, + lora_dropout=self.lora_dropout) + new_module.weight = child_module.weight + new_module.bias = child_module.bias # if bias is None, also copy + else: + raise NotImplementedError + return new_module + + + + def mark_as_delta(self, module: nn.Module = None): + if module is None: + module=self + for n, p in module.named_parameters(): + param_name = n.split(".")[-1] + if "lora_A" in param_name or "lora_B" in param_name: # only lora_A, lora_B is the delta parameter. 
+ setattr(p, "_is_delta", True) + + + \ No newline at end of file diff --git a/opendelta/delta_models/low_rank_adapter.py b/opendelta/delta_models/low_rank_adapter.py new file mode 100644 index 0000000..b02fdb9 --- /dev/null +++ b/opendelta/delta_models/low_rank_adapter.py @@ -0,0 +1,208 @@ + +from opendelta.basemodel import DeltaBase +from opendelta.delta_configs import BaseDeltaConfig +from opendelta.delta_models.layers.low_rank_linear import LowRankLinear +from opendelta.delta_models.layers.activations import Activations +from typing import Optional, Union +from opendelta.utils.signature import get_arg_names_inside_func +import torch.nn as nn +import torch +from functools import partial +from typing import Optional +from opendelta.utils.name_based_addressing import * +from opendelta.utils.cuda import get_device +from opendelta.basemodel import DeltaBase +import loralib as lora +import torch.nn as nn +import torch +import math +import opendelta.utils.logging as logging +logger = logging.get_logger(__name__) + + +class LowRankAdapterConfig(BaseDeltaConfig): + r""" + This is the configuration class to store the configuration of a :py:class:`~LowRankAdapterModel` + + """ + def __init__( + self, + reduction_factor=32, + non_linearity="gelu_new", + low_rank_w_init="glorot-uniform", + low_rank_rank=1, + **kwargs + ): + super().__init__(**kwargs) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # the arg has not been registered in parent config + setattr(self, arg_name, locals()[arg_name]) + + + +class LowRankAdapter(nn.Module): + """This is the low-rank adapter, in which each adapter is composed of two rank-one matrices. + """ + def __init__(self, + reduction_factor=32, + non_linearity="gelu_new", + low_rank_w_init="glorot-uniform", + low_rank_rank=1, + device=None): + super().__init__() + self.reduction_factor = reduction_factor + self.non_linearity = non_linearity + self.low_rank_w_init = low_rank_w_init + self.low_rank_rank = low_rank_rank + self.device = device + self.instantiated = False + + + def instantiate(self, hidden_dim): + + self.down_sample_size = hidden_dim // self.reduction_factor + self.activation = Activations(self.non_linearity.lower()).to(self.device) + self.down_sampler = LowRankLinear(hidden_dim, self.down_sample_size, + w_init=self.low_rank_w_init, + rank=self.low_rank_rank).to(self.device) + self.up_sampler = LowRankLinear(self.down_sample_size, hidden_dim, + w_init=self.low_rank_w_init, + rank=self.low_rank_rank).to(self.device) + + self.instantiated = True + + def post_forward(self, output): + r""" Get the hidden_states from the PLM's layer output, pass it into the low-rank adapter, + then combined with the main hidden_states. Finally pass it into the subsequent layer. 
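+
+        Each sampler here is a :py:class:`LowRankLinear` whose weight is built from two low-rank
+        (rank-one by default) factors ``W_left`` and ``W_right``, and the result is combined with the
+        input through the same residual connection as an ordinary adapter.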
+ + """ + + if isinstance(output, tuple): + hiddens = output[0] + elif isinstance(output, torch.Tensor): + hiddens = output + else: + raise TypeError + + if not self.instantiated: + self.hidden_dim = hiddens.shape[-1] + logger.debug(f"Got hidden dim hidden_dim {self.hidden_dim}") + self.instantiate(hidden_dim=self.hidden_dim) + + + z = self.down_sampler(hiddens) + z = self.activation(z) + adapter_output = self.up_sampler(z) + + modified_output = adapter_output + hiddens # residual_connection + if isinstance(output, tuple): + output = (modified_output,) + output[1:] + elif isinstance(output, torch.Tensor): + output = modified_output + else: + raise TypeError + return output + + + + + + +class LowRankAdapterModel(DeltaBase): + r""" The implementation of LowRankAdapter, proposed as a baseline in + `Compacter: Efficient Low-Rank Hypercomplex Adapter Layers `_ . + We found that it enjoys very few parameters but competitive performance, thus add it into OpenDelta. + Low Rank Adapter parameterize each adapter’s weight as a product of two rank-one(low) weights. + + Add lowrank adapter layer to the designated ``modified_modules``. In sequential paradigm, The modules' output is then + passed into the low rank adapter's post_forward. + + .. note:: + We **assume** the output of the modified module is the hidden state or a tuple where hidden state is the + first element. This is true for most PLMs. However, we admit that currently it's not rigorous, We will improve + it in the next version. Currently, if you encount an error here for you backbone, you can modify the code to + get the hidden state. + + All the hyperparameter is adopted from the `compacter code base `_ . + + class attributes: + - default_modified_modules = ["attn", "ff"] According to the compacter paper, we add low rank adapter to the attention layer + and feed forward layer. + - delta_type = "lowrankadapter" + + Args: + backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. + reduction_factor (:obj:`int`, *optional*, default to ``16``): bottleneck_dim = hidden_dim//reduction_factor + non_linearity (:obj:`str`, *optional*, default to ``"gelu_new"``): The non linearity activation used in between the down + projecter and the up projecter. + low_rank_w_init (:obj:`str`, *optional*, default to ``"glorot-uniform"``): The weight init method of the factorized + linear weight. + low_rank_rank (:obj:`int`, *optional*, default to 1): The rank of the low-rank decomposition. + modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only + the implemented ones) + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen + together with the prefix parameters. + common_structure (:obj:`bool`, *optional*, default to :obj:`None`): whether using name-based addressing witha common structure mapping. 
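+
+    Example (a minimal usage sketch; the checkpoint name is only illustrative)::
+
+        >>> from transformers import AutoModelForMaskedLM
+        >>> backbone = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
+        >>> delta_model = LowRankAdapterModel(backbone_model=backbone, reduction_factor=32)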
+ + """ + + config_class = LowRankAdapterConfig + delta_type = "lowrankadapter" + default_modified_modules = ['attn', 'ff'] + def __init__(self, + backbone_model: nn.Module, + reduction_factor = 32, + non_linearity = "gelu_new", + low_rank_w_init = "glorot-uniform", + low_rank_rank = 1, + modified_modules: Optional[bool] = None, + unfrozen_modules: Optional[bool] = None, + common_structure: Optional[bool] = None, + interactive_modify: Optional[Union[bool, int]] = False, + ): + DeltaBase.__init__(self, + backbone_model, + modified_modules=modified_modules, + unfrozen_modules=unfrozen_modules, + common_structure=common_structure, + interactive_modify=interactive_modify, + ) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # not registered in parent class + setattr(self, arg_name, locals()[arg_name]) + + self.delta_modules = nn.ModuleList() + + self.add_all_delta_to_backbone(self.backbone_model, + self.modified_modules, + ) + + + def add_all_delta_to_backbone(self, + module: nn.Module, + modified_modules: List[str], + ) -> nn.Module: + for key, _ in module.named_modules(): + if self.find_key(key, modified_modules): + self.update_module(module, key) + self._pseudo_data_to_instantiate(module) + self.mark_as_delta() + return module + + def update_module(self, module: nn.Module, key: str): + _, _, ref = self.find_module(module, key) + adapterlayer = self.new_module_like(ref) + self.insert_sequential_module(ref, delta_module=adapterlayer, name="low_rank_adapter") + + def new_module_like(self, module): + module_device = get_device(module) + adapterlayer = LowRankAdapter(reduction_factor = self.reduction_factor, + non_linearity = self.non_linearity, + low_rank_w_init = self.low_rank_w_init, + low_rank_rank = self.low_rank_rank, + device=module_device) + self.delta_modules.append(adapterlayer) + return adapterlayer + \ No newline at end of file diff --git a/opendelta/delta_models/prefix.py b/opendelta/delta_models/prefix.py new file mode 100644 index 0000000..debfd8e --- /dev/null +++ b/opendelta/delta_models/prefix.py @@ -0,0 +1,567 @@ +from functools import partial +from opendelta.delta_configs import BaseDeltaConfig +from opendelta.utils.signature import get_arg_names_inside_func, signature +from typing import Optional, Union +from transformers.models.distilbert.modeling_distilbert import MultiHeadSelfAttention +from transformers.models.t5.modeling_t5 import T5Attention, T5LayerSelfAttention +from transformers.models.bert.modeling_bert import BertSelfAttention +from transformers.models.gpt2.modeling_gpt2 import GPT2Attention +from transformers.models.bart.modeling_bart import BartAttention +from transformers.models.roberta.modeling_roberta import RobertaAttention +from opendelta.utils.name_based_addressing import * +from opendelta.utils.cuda import get_device +from opendelta.basemodel import DeltaBase +from transformers.models.t5 import T5ForConditionalGeneration +import loralib as lora +import torch.nn as nn +import torch +import opendelta.utils.logging as logging +logger = logging.get_logger(__name__) + + +class PrefixLayerT5(nn.Module): + r"""A layer of prefix tuning module. The layer's forward function pass (or concatenate) the additional past_key_value + into the original attention layer's forward function. 
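+
+    Concretely, ``pre_forward`` reshapes the learned ``past_key``/``past_value`` of shape
+    ``(prefix_token_num, hidden_dim)`` into ``(num_heads, prefix_token_num, hidden_dim // num_heads)``,
+    expands them over the batch dimension, hands them to ``T5Attention`` as ``past_key_value``, and
+    prepends zeros to the additive attention mask so the prefix positions stay attendable; the cached
+    ``position_bias`` is reset whenever its length no longer matches the prefixed sequence.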
+ """ + def __init__(self, prefix_token_num, num_heads, device,): + super().__init__() + self.prefix_token_num = prefix_token_num + self.num_heads = num_heads + self.device = device + self.instantiated = False + + def instantiate(self, hidden_dim): + self.past_key = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_value = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_key_reparam = None + self.past_value_reparam = None + self.instantiated = True + + + def pre_forward(self, *args, **kwargs): + r"""The args and kwargs are inherited from the T5Attention's forward function. + """ + batch_size = args[0].shape[0] + seq_len = args[0].shape[-2] + if not self.instantiated: + self.hidden_dim = args[0].shape[-1] + self.instantiate(hidden_dim=self.hidden_dim) + if self.past_key_reparam is None: + past_key = self.past_key.data + else: + past_key = self.past_key_reparam + if self.past_value_reparam is None: + past_value = self.past_value.data + else: + past_value = self.past_value_reparam + + + def expand_batchsize(x): + x = x.reshape(self.prefix_token_num, self.num_heads, -1).transpose(0,1) + x = x.unsqueeze(0).expand(batch_size, *x.shape) + return x + + if 'position_bias' in kwargs and kwargs['position_bias'] is not None: + if kwargs['position_bias'].shape[-1] != seq_len + self.prefix_token_num: # Then the position_bias should be re-calculated + kwargs['position_bias'] = None + if kwargs['past_key_value'] is None: + kwargs['past_key_value'] = (expand_batchsize(past_key), expand_batchsize(past_value)) + + past_key_len = kwargs['past_key_value'][0].shape[-2] + + if 'mask' in kwargs and kwargs['mask'] is not None: + mask_len = kwargs['mask'].shape[-1] + if past_key_len + seq_len == mask_len + self.prefix_token_num: + + am = kwargs['mask'] # Should check the format of the attention_mask when moving to a new plm. + kwargs['mask'] = torch.cat([-torch.zeros((*am.shape[:-1],self.prefix_token_num), dtype = am.dtype,device=am.device), am], dim=-1) + return args, kwargs + + def post_forward(self, output): + r""" Remove the cached positional bias, since the next layer may not have prefix token. + """ + output = output[:2] + (None, )+ output[3:] + return output + +class PrefixLayerBart(nn.Module): + r"""A layer of prefix tuning module. The layer's forward function pass (or concatenate) the additional past_key_value + into the original attention layer's forward function. + """ + def __init__(self, prefix_token_num, num_heads, device,): + super().__init__() + self.prefix_token_num = prefix_token_num + self.num_heads = num_heads + self.device = device + self.instantiated = False + + def instantiate(self, hidden_dim): + self.past_key = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_value = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_key_reparam = None + self.past_value_reparam = None + self.instantiated = True + + + def pre_forward(self, *args, **kwargs): + r"""The args and kwargs are inherited from the T5Attention's forward function. 
+ """ + + batch_size = kwargs['hidden_states'].shape[0] + if not self.instantiated: + self.hidden_dim = kwargs['hidden_states'].shape[-1] + self.instantiate(hidden_dim=self.hidden_dim) + if self.past_key_reparam is None: + past_key = self.past_key.data + else: + past_key = self.past_key_reparam + if self.past_value_reparam is None: + past_value = self.past_value.data + else: + past_value = self.past_value_reparam + + # from IPython import embed + # embed() + def expand_batchsize(x): + x = x.reshape(self.prefix_token_num, self.num_heads, -1).transpose(0,1) + x = x.unsqueeze(0).expand(batch_size, *x.shape) + return x + # from IPython import embe + + if 'past_key_value' not in kwargs or kwargs['past_key_value'] is None: + kwargs['past_key_value'] = (expand_batchsize(past_key), expand_batchsize(past_value)) + + if 'attention_mask' in kwargs and kwargs['attention_mask'] is not None: + am = kwargs['attention_mask'] # Should check the format of the attention_mask when moving to a new plm. + kwargs['attention_mask'] = torch.cat([-torch.zeros((*am.shape[:-1],self.prefix_token_num), dtype = am.dtype,device=am.device), am], dim=-1) + return args, kwargs + + +class PrefixLayerGPT2(nn.Module): + r"""A layer of prefix tuning module. The layer's forward function pass (or concatenate) the additional past_key_value + into the original attention layer's forward function. + """ + def __init__(self, prefix_token_num, num_heads, device,): + super().__init__() + self.prefix_token_num = prefix_token_num + self.num_heads = num_heads + self.device = device + self.instantiated = False + + def instantiate(self, hidden_dim): + self.past_key = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_value = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_key_reparam = None + self.past_value_reparam = None + self.instantiated = True + + + def pre_forward(self, *args, **kwargs): + r"""The args and kwargs are inherited from the T5Attention's forward function. + """ + batch_size = args[0].shape[0] + if not self.instantiated: + self.hidden_dim = args[0].shape[-1] + self.instantiate(hidden_dim=self.hidden_dim) + if self.past_key_reparam is None: + past_key = self.past_key.data + else: + past_key = self.past_key_reparam + if self.past_value_reparam is None: + past_value = self.past_value.data + else: + past_value = self.past_value_reparam + + def expand_batchsize(x): + x = x.reshape(self.prefix_token_num, self.num_heads, -1).transpose(0,1) + x = x.unsqueeze(0).expand(batch_size, *x.shape) + return x + + + if kwargs['layer_past'] is None: + kwargs['layer_past'] = (expand_batchsize(past_key), expand_batchsize(past_value)) + if 'attention_mask' in kwargs and kwargs['attention_mask'] is not None: + am = kwargs['attention_mask'] # Should check the format of the attention_mask when moving to a new plm. 
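+            # Prepend zeros for the prefix positions: GPT-2's attention mask is additive,
+            # so zero entries leave the learned prefix tokens fully attendable.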
+ kwargs['attention_mask'] = torch.cat([-torch.zeros((*am.shape[:-1],self.prefix_token_num), dtype = am.dtype,device=am.device), am], dim=-1) + return args, kwargs + + + +class PrefixLayerDistilBert(nn.Module): + # TODO: Warning: have bugs + def __init__(self, prefix_token_num, device,): + super().__init__() + self.prefix_token_num = prefix_token_num + self.device = device + self.key_instantiated = False + self.value_instantiated = False + + def forward(self, *args, **kwargs): + mask = kwargs["mask"] + key, value = kwargs['key'], kwargs['value'] + prefix_mask = torch.ones(mask.shape[0], self.prefix_token_num, dtype=mask.dtype, device=mask.device) + concated_mask = torch.cat([prefix_mask, mask], dim=1) + pseudo_prefix = torch.zeros(key.shape[0], self.prefix_token_num, key.shape[2], dtype=key.dtype, device=key.device) + concated_key = torch.cat([pseudo_prefix, key], dim=1) + concated_value = torch.cat([pseudo_prefix, value], dim=1) + kwargs["mask"] = concated_mask + kwargs['key'] = concated_key + kwargs['value'] = concated_value + return args, kwargs + + + def key_instantiate(self, hidden_dim): + self.past_key = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_key_reparam = None + self.key_instantiated = True + + def value_instantiate(self, hidden_dim): + self.past_value = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_value_reparam = None + self.value_instantiated = True + + def key_pre_forward(self, *args, **kwargs): + _input = args[0] + _input = _input[:,self.prefix_token_num:, :] + args = (_input,) +args[1:] + return args, kwargs + + def value_pre_forward(self, *args, **kwargs): + _input = args[0] + _input = _input[:,self.prefix_token_num:, :] + args = (_input,) +args[1:] + return args, kwargs + + def key_forward(self, output: torch.Tensor): ### Check whether run prefix is ok, 12.21 + if isinstance(output, torch.Tensor): + hiddens = output + else: + raise TypeError + if not self.key_instantiated: + self.hidden_dim = hiddens.shape[-1] + logger.debug(f"Got key hidden dim hidden_dim {self.hidden_dim}") + self.key_instantiate(hidden_dim=self.hidden_dim) + batch_size = hiddens.shape[0] + if self.past_key_reparam is None: + past_key = self.past_key.data + else: + past_key = self.past_key_reparam + output = torch.cat([past_key.unsqueeze(0).expand(batch_size, *past_key.shape), hiddens], dim=1) + return output + + def value_forward(self, output: torch.Tensor): ### Check whether run prefix is ok, 12.21 + if isinstance(output, torch.Tensor): + hiddens = output + else: + raise TypeError + if not self.value_instantiated: + self.hidden_dim = hiddens.shape[-1] + logger.debug(f"Got value hidden dim hidden_dim {self.hidden_dim}") + self.value_instantiate(hidden_dim=self.hidden_dim) + batch_size = hiddens.shape[0] + if self.past_value_reparam is None: + past_value = self.past_value.data + else: + past_value = self.past_value_reparam + output = torch.cat([past_value.unsqueeze(0).expand(batch_size, *past_value.shape), hiddens], dim=1) + return output + + +class PrefixLayerRoberta(nn.Module): + r"""A layer of prefix tuning module. The layer's forward function pass (or concatenate) the additional past_key_value + into the original attention layer's forward function. 
+ """ + def __init__(self, prefix_token_num, num_heads, device,): + super().__init__() + self.prefix_token_num = prefix_token_num + self.num_heads = num_heads + self.device = device + self.instantiated = False + + def instantiate(self, hidden_dim): + self.past_key = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_value = nn.Parameter(torch.randn(self.prefix_token_num, hidden_dim, device=self.device), requires_grad=True) + self.past_key_reparam = None + self.past_value_reparam = None + self.instantiated = True + + + def pre_forward(self, *args, **kwargs): + r"""The args and kwargs are inherited from the T5Attention's forward function. + """ + batch_size = args[0].shape[0] + if not self.instantiated: + self.hidden_dim = args[0].shape[-1] + self.instantiate(hidden_dim=self.hidden_dim) + if self.past_key_reparam is None: + past_key = self.past_key.data + else: + past_key = self.past_key_reparam + if self.past_value_reparam is None: + past_value = self.past_value.data + else: + past_value = self.past_value_reparam + + # from IPython import embed + # embed() + def expand_batchsize(x): + x = x.reshape(self.prefix_token_num, self.num_heads, -1).transpose(0,1) + x = x.unsqueeze(0).expand(batch_size, *x.shape) + return x + # from IPython import embe + + if 'past_key_value' not in kwargs or kwargs['past_key_value'] is None: + kwargs['past_key_value'] = (expand_batchsize(past_key), expand_batchsize(past_value)) + + if 'attention_mask' in kwargs and kwargs['attention_mask'] is not None: + am = kwargs['attention_mask'] # Should check the format of the attention_mask when moving to a new plm. + kwargs['attention_mask'] = torch.cat([-torch.zeros((*am.shape[:-1],self.prefix_token_num), dtype = am.dtype,device=am.device), am], dim=-1) + elif len(args) >1: # attention mask is passed via positional argument + am = args[1] + am = torch.cat([-torch.zeros((*am.shape[:-1],self.prefix_token_num), dtype = am.dtype,device=am.device), am], dim=-1) + args = (args[0], am) + args[2:] + # from IPython import embed + # embed(header = "Herein prefixroberta") + return args, kwargs + + + + # def post_forward(self, output): + # r""" Remove the cached positional bias, since the next layer may not have prefix token. + # """ + # output = output[:2] + (None, )+ output[3:] + # return output + + +class ReparameterizeFunction(nn.Module): + r""" Prefix Tuning's performance is better with a reparameterize module, which generates + the ``past_key_value`` using an MLP instead of directly optimizing the ``past_key_value`` as leaf variable. + In our implementation, the reparameterize module is constructed according to the number of parameters + in all ``past_key_value``s. Thus, variable number of prefixlayer is supported (not restricting to being equal + to the number of layers of the pretraind language model) + + + """ + def __init__(self, prefix_token_num, embed_dim, dropout_rate=0.0, mid_dim=512, module_list=[]): + super().__init__() + self.prefix_token_num = prefix_token_num + self.embed_dim = embed_dim + self.mid_dim = mid_dim + self.module_list = module_list + self.dropout = nn.Dropout(dropout_rate) + self.record_parameters() + self.compatibility_check() + self.define_reparameterization_network() + + def record_parameters(self): + r""" Enumerate the parameters that need to be reparameterized. + Then, delete the original parameters. 
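+        The deleted parameters are re-generated on every forward pass by ``allocate_parameter``,
+        which writes the outputs of the reparameterization MLP into each prefix layer's
+        ``past_key_reparam`` and ``past_value_reparam``.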
+        """
+        tot = 0
+        for module in self.module_list:
+            for n, parameters in module.named_parameters():
+                tot += parameters.numel()
+                module.register_parameter(n, None)
+        self.total_parameters_num = tot
+
+    def compatibility_check(self,):
+        r"""May be removed.
+        """
+        assert self.total_parameters_num % self.prefix_token_num == 0
+
+    def allocate_parameter(self):
+        r""" At the beginning of each forward pass through the whole network (PLM),
+        calculate the reparameterized past_key and past_value (``past_key_reparam`` and ``past_value_reparam``)
+        for later use in each layer.
+        """
+        input_tokens = self.input_tokens
+        temp_control = self.wte(input_tokens)
+        past_key_values = self.control_trans(temp_control)
+        seqlen, _ = past_key_values.shape
+
+        past_key_values = past_key_values.view(seqlen, len(self.module_list) * 2, self.module_list[0].hidden_dim)
+        past_key_values = self.dropout(past_key_values)
+        past_key_values = past_key_values.permute([1, 0, 2]).split(2)
+
+        for module_id, module in enumerate(self.module_list):
+            module.past_key_reparam = past_key_values[module_id][0]
+            module.past_value_reparam = past_key_values[module_id][1]
+
+    def pre_forward(self, *args, **kwargs):
+        r""" First forward through the reparameterization network, then continue with the normal forward pass of the PLM.
+        """
+        self.allocate_parameter()
+        return args, kwargs
+
+    def define_reparameterization_network(self) -> None:
+        r""" Build the reparameterization network.
+        """
+        self.input_tokens = nn.Parameter(torch.arange(self.prefix_token_num).long(), requires_grad=False) # to allow automatic device placement
+        self.wte = nn.Embedding(self.prefix_token_num, self.embed_dim)
+        self.control_trans = nn.Sequential(
+            nn.Linear(self.embed_dim, self.mid_dim),
+            nn.Tanh(),
+            nn.Linear(self.mid_dim, self.total_parameters_num//self.prefix_token_num)
+        )
+
+
+class PrefixConfig(BaseDeltaConfig):
+    def __init__(
+        self,
+        prefix_token_num=6,
+        reparameterize=True,
+        embed_dim: Optional[int]=512,
+        mid_dim: Optional[int]=512,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        arg_names = get_arg_names_inside_func(self.__init__)
+        for arg_name in arg_names:
+            if not hasattr(self, arg_name): # the arg has not been registered in parent config
+                setattr(self, arg_name, locals()[arg_name])
+
+
+
+
+class PrefixModel(DeltaBase):
+    r""" The implementation of `Prefix-Tuning: Optimizing Continuous Prompts for Generation <https://arxiv.org/abs/2101.00190>`_ .
+    However, as the attention blocks of different PLMs differ substantially, e.g., in their input arguments and in the naming
+    convention of ``past_key_value``, we have to implement a different prefix layer for each PLM. Given this inconvenience at the
+    code level, we only support several commonly used backbone models (currently: T5, DistilBERT, BERT, RoBERTa, GPT-2,
+    BART). If you are trying to apply delta tuning to other backbone models, we suggest trying other delta models,
+    or implementing it yourself and making a pull request.
+
+    Experimental Feature:
+
+        Support inserting prefix tokens before only a subset of layers, e.g., layers 3, 4, 6, and 10, while leaving the other
+        layers untouched.
+
+    .. note::
+
+        If reparameterization is used, the trainable parameters live in the reparameterization network (attached to the first
+        prefix layer) rather than in the prefix itself. A function that saves only the generated prefix parameters will be
+        added in the next version.
+
+
+    Args:
+        backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified.
+ prefix_token_num (:obj:`int`): the number of prefix token + reparameterize (:obj:`bool`): Whether use the reparameterization for prefix tuning. + embed_dim (:obj:`int`): The embeding dimension of prefix token when using the reparameterization. + mid_dim (:obj:`int`): The dimension of the hiddens of the reparameterization network. + modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only + the implemented ones) + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen + together with the prefix parameters. + common_structure (:obj:`bool`): whether using name-based addressing witha common structure mapping. + + """ + config_class = PrefixConfig + delta_type = "prefix" + default_modified_modules = ['attn'] + def __init__(self, + backbone_model: nn.Module, + prefix_token_num=6, + reparameterize=True, + embed_dim: Optional[int]=512, + mid_dim: Optional[int]=512, + modified_modules: Optional[List[str]] = None, + unfrozen_modules: Optional[List[str]] = None, + common_structure: Optional[bool] = None, + interactive_modify: Optional[Union[bool, int]] = False, + ): + DeltaBase.__init__(self, + backbone_model, + modified_modules=modified_modules, + unfrozen_modules=unfrozen_modules, + common_structure=common_structure, + interactive_modify=interactive_modify, + ) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # not registered in parent class + setattr(self, arg_name, locals()[arg_name]) + + self.delta_modules = nn.ModuleList() + + self.add_all_delta_to_backbone(self.backbone_model, + self.modified_modules, + ) + + def add_all_delta_to_backbone(self, + module: nn.Module, + modified_modules: List[str], + ) -> nn.Module: + first_modified_module = None + # Current, We assume the layerer are in order in named_modules. + # Thus the first modified module is the first module that the tensor flows to. + for key, _ in module.named_modules(): + if self.find_key(key, modified_modules): + logger.debug("find key {}".format(key)) + if first_modified_module is None: + _, _, ref = self.find_module(module, key) + first_modified_module = ref + self.update_module(module, key) + + self._pseudo_data_to_instantiate(module) + + if self.reparameterize: + reparams = ReparameterizeFunction(prefix_token_num=self.prefix_token_num, + embed_dim=self.embed_dim, + mid_dim=self.mid_dim, + module_list=self.delta_modules) + self.delta_modules = None + self.reparams = reparams + self.insert_sequential_module(first_modified_module, delta_module=reparams, name="reparams", strict=False) + self.mark_as_delta() + return module + + + + def update_module(self, module: nn.Module, key: str): + _, _, ref = self.find_module(module, key) + + prefixlayer, ref = self.new_module_like(ref) + self.insert_sequential_module(ref, delta_module=prefixlayer, name="prefix") + self.delta_modules.append(prefixlayer) + + def new_module_like(self, module): + # TODO: support more Attention modules + + if isinstance(module, T5Attention) or isinstance(module, T5LayerSelfAttention): + if isinstance(module, T5LayerSelfAttention): + module = module.SelfAttention # innermodule + module_device = get_device(module) + prefixlayer = PrefixLayerT5(prefix_token_num=self.prefix_token_num, num_heads=module.n_heads ,device=module_device) + elif isinstance(module, MultiHeadSelfAttention): # MultiHeadSelfAttention didn't provide past_key_value in the interface of the forward function. 
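+            # DistilBERT's MultiHeadSelfAttention exposes no ``past_key_value`` argument, so instead of
+            # injecting the prefix through kwargs, the prefix layer wraps the ``k_lin``/``v_lin`` projections
+            # below and prepends its own prefix keys/values to their outputs.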
+ module_device = get_device(module) + prefixlayer = PrefixLayerDistilBert(prefix_token_num=self.prefix_token_num, device=module_device) + self.insert_sequential_module(getattr(module, "k_lin"), pre_caller=prefixlayer.key_pre_forward, post_caller=prefixlayer.key_forward) + self.insert_sequential_module(getattr(module, "v_lin"), pre_caller=prefixlayer.value_pre_forward, post_caller=prefixlayer.value_forward) + elif isinstance(module, BertSelfAttention): + raise NotImplementedError + elif isinstance(module, RobertaAttention): + module_device = get_device(module) + prefixlayer = PrefixLayerRoberta(prefix_token_num=self.prefix_token_num, num_heads=module.self.num_attention_heads,device=module_device) + elif isinstance(module, GPT2Attention): + module_device = get_device(module) + prefixlayer = PrefixLayerGPT2(prefix_token_num=self.prefix_token_num, num_heads=module.num_heads ,device=module_device) + elif isinstance(module, BartAttention): + module_device = get_device(module) + prefixlayer = PrefixLayerBart(prefix_token_num=self.prefix_token_num, num_heads=module.num_heads ,device=module_device) + else: + raise NotImplementedError(type(module)) + return prefixlayer, module + + + + + + + + + + + + + \ No newline at end of file diff --git a/opendelta/delta_models/soft_prompt.py b/opendelta/delta_models/soft_prompt.py new file mode 100644 index 0000000..0d2fd21 --- /dev/null +++ b/opendelta/delta_models/soft_prompt.py @@ -0,0 +1,209 @@ +from opendelta.utils.signature import get_arg_names, get_arg_names_inside_func +from opendelta.utils.name_based_addressing import * +from opendelta.utils.cuda import get_device +from opendelta.basemodel import DeltaBase +from typing import * +import torch +import torch.nn as nn +from opendelta import BaseDeltaConfig +from decorator import decorate +import torch.nn.functional as F +from opendelta import logging +logger = logging.get_logger(__name__) + +class SoftPromptConfig(BaseDeltaConfig): + r""" + This is the configuration class to store the configuration of a :py:class:`SoftPromptModel` + + """ + def __init__( + self, + soft_token_num=100, + init_range = 0.5, + token_init = True, + **kwargs + ): + super().__init__(**kwargs) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # the arg has not been registered in parent config + setattr(self, arg_name, locals()[arg_name]) + + + +class SoftPromptLayer(nn.Module): + r"""This is the implementation of `The Power of Scale for Parameter-Efficient + Prompt Tuning `_ . Similar to :obj:`PrefixTuningTemplate`, + This template also does not need any textual template. Addition tokens are directly + concatenated into the input ids. There are two initializations of the new tokens. + (1). random initialization. (2) initialize with the tokens of the plm (We simply take + the first n_tokens similar to their implementation). + + Note that this template can be simply achieved by :obj:`SoftManualTemplate`, in which + you set ``n_token`` tokens template before the will give the same result. 
+ """ + + def __init__(self, + soft_token_num: int = 100, + raw_embedding: Optional[torch.Tensor] = None, + init_range: Optional[float] = 0.5, + token_init = False, + pad_id = 0, + device: Optional[str]=None, + ): + super().__init__() + self.__dict__['raw_embedding'] = raw_embedding + + self.init_range = init_range + self.num_tokens = soft_token_num + self.pad_id = pad_id + self.token_init = token_init + self.device = device + + assert self.num_tokens>0 + self.instantiate(raw_embedding(torch.tensor([0])).shape[-1]) + + def pre_forward(self, *args, **kwargs): + # if attention_mask is passed as PLM's input, modify it here + if 'encoder_outputs' in kwargs and kwargs['encoder_outputs'] is not None: + # In generation, the input is forward through the model again. + return args, kwargs + + if 'input_ids' in kwargs: + input_ids = kwargs['input_ids'] + kwargs['input_ids'] = None + elif len(args) > 0: + input_ids = args[0] + args = args[1:] + else: + input_ids = None + + + if 'attention_mask' not in kwargs or kwargs['attention_mask'] is None: + # infer attention mask + if input_ids is None: + raise RuntimeError("no input ids found") + kwargs['attention_mask'] = (input_ids != self.pad_id).to(torch.int64) + + if 'inputs_embeds' not in kwargs or kwargs['inputs_embeds'] is None: + try: + inputs_embeds = self.raw_embedding(input_ids) + except: + raise RuntimeError("neither inputs_embeds nor input_ids is specified.") + else: + inputs_embeds = kwargs['inputs_embeds'] + + + + batch_size = inputs_embeds.size(0) + soft_embeds = self.soft_embeds.repeat(batch_size, 1, 1) + inputs_embeds = torch.cat([soft_embeds, inputs_embeds], 1) + kwargs['inputs_embeds'] = inputs_embeds + + am = kwargs['attention_mask'] + am.data = torch.cat([torch.ones((*am.shape[:-1], inputs_embeds.shape[-2]-am.shape[-1]), dtype = am.dtype,device=am.device), am], dim=-1) + + return args, kwargs + + def instantiate(self, hidden_dim) -> None: + """ + generate parameters needed for soft tokens embedding in soft-prompt + for soft tokens, use a new embedding layer which is initialized with their corresponding embedding of hard tokens + """ + soft_embeds = torch.FloatTensor(self.num_tokens, hidden_dim) + if self.token_init: + soft_embeds.data = torch.clone(self.raw_embedding(torch.tensor([i for i in range(self.num_tokens)]))) + else: + soft_embeds = soft_embeds.uniform_(-self.init_range, self.init_range) + + self.soft_embeds = nn.Parameter(soft_embeds, requires_grad=True).to(self.device) + + +class SoftPromptModel(DeltaBase): + r""" + This is the implementation of `The Power of Scale for Parameter-Efficient + Prompt Tuning `_ . Similar to :obj:`PrefixTuningTemplate`, + This template also does not need any textual template. Addition tokens are directly + concatenated into the input ids. There are two initializations of the new tokens. + (1). random initialization. (2) initialize with the tokens of the plm (We simply take + the first n_tokens similar to their implementation). + + Note that this template can be simply achieved by :obj:`SoftManualTemplate`, in which + you set ``n_token`` tokens template before the will give the same result. + + Args: + backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. + soft_token_num (:obj:`int`, *optional*): num of new tokens to add in the front of the input. + init_range (:obj:`bool`, *optional*): If initialize new tokens randomly, the random range of uniform distribution. 
+        token_init (:obj:`bool`, *optional*, defaults to :obj:`True`): Whether to initialize the new tokens with the
+            embeddings of the PLM's first tokens.
+        modified_modules (:obj:`List[str]`): Not really used for soft prompt tuning, since the soft prompt layer is always
+            attached to the root of the backbone model.
+        unfrozen_modules (:obj:`List[str]`, *optional*, defaults to :obj:`None`): The modules that should be unfrozen
+            together with the prefix parameters.
+        common_structure (:obj:`bool`): whether to use name-based addressing with a common structure mapping.
+
+    """
+    config_class = SoftPromptConfig
+    delta_type = "soft_prompt"
+    default_modified_modules = ["root"] # not used
+    def __init__(self,
+                 backbone_model: nn.Module,
+                 soft_token_num=100,
+                 init_range = 0.5,
+                 token_init=True,
+                 modified_modules: Optional[bool] = None,
+                 unfrozen_modules: Optional[bool] = None,
+                 common_structure: Optional[bool] = None,
+                 interactive_modify: Optional[Union[bool, int]] = False,
+                ):
+        DeltaBase.__init__(self,
+                           backbone_model = backbone_model,
+                           modified_modules = ["root"],
+                           unfrozen_modules = unfrozen_modules,
+                           common_structure = False,
+                           interactive_modify = interactive_modify,
+                           )
+
+        arg_names = get_arg_names_inside_func(self.__init__)
+        for arg_name in arg_names:
+            if not hasattr(self, arg_name): # not registered in parent class
+                setattr(self, arg_name, locals()[arg_name])
+
+
+        try:
+            self.__dict__['raw_embedding'] = self.backbone_model.get_input_embeddings()
+        except AttributeError:
+            raise AttributeError(f"'{type(self.backbone_model)}' object has no attribute 'get_input_embeddings', please pass "+
+                                 "input embeddings into 'self.raw_embedding' in user-specific ways.")
+
+        self.delta_modules = nn.ModuleList()
+        self.add_all_delta_to_backbone(self.backbone_model,
+                                       self.modified_modules,
+                                       )
+
+    def add_all_delta_to_backbone(self,
+                                  module: nn.Module,
+                                  modified_modules: List[str],
+                                 ) -> nn.Module:
+        self.update_module()
+        self.mark_as_delta()
+        return module
+
+    def update_module(self):
+        soft_prompt_layer = self.new_module_like(self.raw_embedding)
+        self.insert_sequential_module(self.backbone_model.get_encoder() if self.backbone_model.config.is_encoder_decoder else self.backbone_model,
+                                      delta_module=soft_prompt_layer,
+                                      name="soft_prompt_layer" )
+
+    def new_module_like(self, module):
+        module_device = get_device(module)
+        soft_prompt_layer = SoftPromptLayer(
+            soft_token_num = self.soft_token_num,
+            raw_embedding = self.raw_embedding,
+            token_init = self.token_init,
+            init_range = self.init_range,
+            device = module_device,
+        )
+        self.delta_modules.append(soft_prompt_layer)
+        return soft_prompt_layer
+
\ No newline at end of file
diff --git a/opendelta/utils/__init__.py b/opendelta/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/opendelta/utils/cuda.py b/opendelta/utils/cuda.py
new file mode 100644
index 0000000..8f9d535
--- /dev/null
+++ b/opendelta/utils/cuda.py
@@ -0,0 +1,58 @@
+from typing import Union
+import torch.nn as nn
+
+def get_device(module : Union[nn.Module, nn.Parameter]):
+    if not (isinstance(module, nn.Module) \
+            or isinstance(module, nn.Parameter)):
+        raise RuntimeError("module is not an instance of torch.nn.Module or torch.nn.Parameter")
+    if hasattr(module, 'device'):
+        return module.device
+    else:
+        params_devices = [p.device for p in module.parameters()]
+        if len(params_devices) == 0:
+            return None
+        elif len(set(params_devices))==1:
+            return params_devices[0]
+        else:
+            raise RuntimeError("The module is parallelized across devices, please get the device of an inner module")
+
+
+# unit test, should be removed later
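+# Illustrative usage (a minimal sketch): the delta layers call ``get_device`` on the module they
+# modify so that newly created parameters are placed on the same device, e.g.
+#
+#     module_device = get_device(attention_module)
+#     past_key = nn.Parameter(torch.randn(prefix_token_num, hidden_dim, device=module_device))
+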
+if __name__ == "__main__": + import torch + import torch.nn as nn + + a = nn.Parameter(torch.randn(3,5)) + + class MyNet(nn.Module): + def __init__(self): + super().__init__() + + class MyNet2(nn.Module): + def __init__(self): + super().__init__() + self.l1 = nn.Linear(3,5).to('cuda:2') + self.l2 = nn.Linear(3,5).to('cuda:2') + + class MyNet3(nn.Module): + def __init__(self): + super().__init__() + self.l1 = nn.Linear(3,5).to('cuda:3') + self.l2 = nn.Linear(3,5).cuda() + + class MyNet4: + pass + + b = MyNet() + c = MyNet2() + d = MyNet3() + e = MyNet4() + + print(get_device(a)) + print(get_device(b)) + print(get_device(c)) + print(get_device(e)) + print(get_device(d)) + + + diff --git a/opendelta/utils/data_parallel.py b/opendelta/utils/data_parallel.py new file mode 100644 index 0000000..973d21f --- /dev/null +++ b/opendelta/utils/data_parallel.py @@ -0,0 +1,39 @@ +# This utils is used to support Using pytorch's native DataParallel method, +# which create several backbone model inside DataParallel. +# DistributedDataParallel doesn't need this function. +from opendelta.utils.decorate import decorate +from collections import OrderedDict + +def new_replicate_for_data_parallel(self): + r""" self is the parent module. + """ + # rewrite the replicate in DataParallel. + def _caller(_org_func, org_module, delta_name, *args, **kwargs): + args = args[1:] # the first argument here is ``self`` + delta_module = getattr(org_module, delta_name) + if hasattr(delta_module, "pre_forward"): + args, kwargs = delta_module.pre_forward(*args, **kwargs) + ret = _org_func(*args, **kwargs) + if hasattr(delta_module, "post_forward"): + ret = delta_module.post_forward(ret) + return ret + replica = self.__new__(type(self)) + org_forward = replica.forward + replica.__dict__ = self.__dict__.copy() + assert replica.forward != org_forward + replica.__dict__['forward'] = org_forward + + + for _delta_info in self._delta_infos: + if _delta_info['method'] == "insert_sequential" and _delta_info['state'] == "on": + new_forward = decorate(replica.forward, _caller, extras=(replica, _delta_info['delta_name']), kwsyntax=True) + replica.__dict__['forward'] = new_forward.__get__(replica, type(replica)) + + # replicas do not have parameters themselves, the replicas reference the original + # module. + replica._parameters = OrderedDict() + replica._buffers = replica._buffers.copy() + replica._modules = replica._modules.copy() + replica._is_replica = True + + return replica \ No newline at end of file diff --git a/opendelta/utils/decorate.py b/opendelta/utils/decorate.py new file mode 100644 index 0000000..d8782e5 --- /dev/null +++ b/opendelta/utils/decorate.py @@ -0,0 +1,75 @@ +# copied and modified from decorator.decorate + +import re +import sys +import inspect +import operator +import itertools +from contextlib import _GeneratorContextManager +from inspect import getfullargspec, iscoroutinefunction, isgeneratorfunction + +def fix(args, kwargs, sig): + """ + Fix args and kwargs to be consistent with the signature + """ + ba = sig.bind(*args, **kwargs) + ba.apply_defaults() # needed for test_dan_schult + return ba.args, ba.kwargs + + +def decorate(func, caller, extras=(), kwsyntax=False): + """ + Decorates a function/generator/coroutine using a caller. + If kwsyntax is True calling the decorated functions with keyword + syntax will pass the named arguments inside the ``kw`` dictionary, + even if such argument are positional, similarly to what functools.wraps + does. By default kwsyntax is False and the the arguments are untouched. 
+ + **The difference between this function and decorator.decorate is that + is support nested decorate. + """ + sig = inspect.signature(func) + if iscoroutinefunction(caller): + async def fun(*args, **kw): + if not kwsyntax: + args, kw = fix(args, kw, sig) + return await caller(func, *(extras + args), **kw) + elif isgeneratorfunction(caller): + def fun(*args, **kw): + if not kwsyntax: + args, kw = fix(args, kw, sig) + for res in caller(func, *(extras + args), **kw): + yield res + else: + def fun(*args, **kw): + if not kwsyntax: + args, kw = fix(args, kw, sig) + return caller(func, *(extras + args), **kw) + fun.__name__ = func.__name__ + fun.__doc__ = func.__doc__ + __wrapped__ = func # support nested wrap + fun.__signature__ = sig + fun.__qualname__ = func.__qualname__ + # builtin functions like defaultdict.__setitem__ lack many attributes + try: + fun.__defaults__ = func.__defaults__ + except AttributeError: + pass + try: + fun.__kwdefaults__ = func.__kwdefaults__ + except AttributeError: + pass + try: + fun.__annotations__ = func.__annotations__ + except AttributeError: + pass + try: + fun.__module__ = func.__module__ + except AttributeError: + pass + try: + fun.__dict__.update(func.__dict__) + except AttributeError: + pass + fun.__wrapped__ = __wrapped__ # support nested wrap + return fun \ No newline at end of file diff --git a/opendelta/utils/delta_hub.py b/opendelta/utils/delta_hub.py new file mode 100644 index 0000000..d0da33e --- /dev/null +++ b/opendelta/utils/delta_hub.py @@ -0,0 +1,24 @@ + + +def create_hub_repo_name(root = "DeltaHub", + dataset = None, + delta_type = None, + model_name_or_path = None, + ): + r"""Currently, it's only a simple concatenation of the arguments. + """ + repo_name = [] + + repo_name.append(f"{delta_type}") + model_name_or_path = model_name_or_path.split("/")[-1] + repo_name.append(f"{model_name_or_path}") + repo_name.append(f"{dataset}") + + repo_name = "_".join(repo_name) + + repo_name = root+"/"+repo_name + return repo_name + + + + diff --git a/opendelta/utils/interactive/__init__.py b/opendelta/utils/interactive/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/opendelta/utils/interactive/templates/index.html b/opendelta/utils/interactive/templates/index.html new file mode 100644 index 0000000..bf6d411 --- /dev/null +++ b/opendelta/utils/interactive/templates/index.html @@ -0,0 +1,176 @@ +$def with (content) + + + + + + + + + + +$:content + + + + + diff --git a/opendelta/utils/interactive/web.py b/opendelta/utils/interactive/web.py new file mode 100644 index 0000000..a7da490 --- /dev/null +++ b/opendelta/utils/interactive/web.py @@ -0,0 +1,127 @@ +from opendelta.utils.visualization import Visualization +import web +import re, os + +space = " " +prefix0 = space * 9 +prefix1 = f"│"+space*5 +prefix2 = f"├─{space}" +prefix3 = f"└─{space}" + +def colorfy(label): + i = 0 + res = "" + while i < len(label): + if label[i] == '[': + color = "" + i += 1 + while label[i] != ']': + color += label[i] + i += 1 + i += 1 + if color[0].isdigit(): # dims but not color + res += f'[{color}]' + else: + if res != "": res += '' + res += f'' + else: + res += label[i] + i += 1 + res += '' + return res + +compressed_pattern_1 = re.compile("[0-9]+-[0-9]+") +compressed_pattern_2 = re.compile(".+(,.+)+") + +def expand_part(part): + res = [] + if compressed_pattern_1.fullmatch(part): + st, ed = map(int, part.split('-')) + for i in range(st, ed+1): + res.append( str(i) ) + elif compressed_pattern_2.fullmatch(part): + for c in part.split(','): + res.append( c ) 
+ else: + res.append( part ) + return res + +def dfs(o, depth, last, old_name): + html = "" + module_names = expand_part(o.module_name) + if depth > 0: + old_last_1 = last[-1] + if len(module_names) > 1: + module_names = [o.module_name] + module_names + for ith, module_name in enumerate(module_names): + if ith == 0: + html += f'
' + elif ith == 1: + html += f'
' + + for i in range(depth-1): + html += prefix0 if last[i] else prefix1 + if depth > 0: + last[-1] = old_last_1 & (ith == 0 or ith == len(module_names)-1) + html += prefix3 if last[-1] else prefix2 + length = len(o.children) + if length > 0: + html += f'' + name = old_name + module_name + if ith > 0: + label = f'[red]{module_name}{o.label[o.label.index("[",1):]}' + else: + label = o.label + html += f'' + if len(module_names) > 1 and ith == 0: + html += '' + html += '
' + html += f'
' + for i, child in enumerate(o.children): + last = last + [i == length-1] + html += dfs(child, depth+1, last, name + ".") + last.pop() + + html += "
" + if ith == 0 or (ith > 1 and ith == len(module_names)-1): + html += "
" + return html + +urls = ( + '/submit/(.*)', 'submit', + '/(.*)', 'hello', +) + +class PortApplication(web.application): + def run(self, port=8080, *middleware): + func = self.wsgifunc(*middleware) + return web.httpserver.runsimple(func, ('0.0.0.0', port)) + +app = PortApplication(urls, globals()) +render = web.template.render(os.path.join(os.path.dirname(__file__), 'templates/')) +names = [] + +class hello: + def GET(self, name): + return render.index(content=html) +class submit: + def GET(self, _): + global names + names = [name.strip("root.") for name in web.input().name.split(";")] + app.stop() + +def interactive(model, port=8888): + tree = Visualization(model).structure_graph(printTree=False) + + global html + html = dfs(tree, 0, [], "") + + print() + print("If on your machine, open the link below for interactive modification.\n " + "If on remote host, you could use port mapping, " + "or run in vscode terminal, which automatically do port mapping for you.") + app.run() + global names + print("modified_modules:") + print(names) + return names diff --git a/opendelta/utils/logging.py b/opendelta/utils/logging.py new file mode 100644 index 0000000..727232d --- /dev/null +++ b/opendelta/utils/logging.py @@ -0,0 +1,278 @@ +# coding=utf-8 +# Copyright 2020 Optuna, Hugging Face +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# OpenDelta copied from Huggingface Transformers +""" Logging utilities.""" + +import logging +import os +import sys +import threading +from logging import CRITICAL # NOQA +from logging import DEBUG # NOQA +from logging import ERROR # NOQA +from logging import FATAL # NOQA +from logging import INFO # NOQA +from logging import NOTSET # NOQA +from logging import WARN # NOQA +from logging import WARNING # NOQA +from typing import Optional + + +_lock = threading.Lock() +_default_handler: Optional[logging.Handler] = None + +log_levels = { + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL, +} + +_default_log_level = logging.INFO + + +def _get_default_logging_level(): + """ + If TRANSFORMERS_VERBOSITY env var is set to one of the valid choices return that as the new default level. If it is + not - fall back to ``_default_log_level`` + """ + env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None) + if env_level_str: + if env_level_str in log_levels: + return log_levels[env_level_str] + else: + logging.getLogger().warning( + f"Unknown option TRANSFORMERS_VERBOSITY={env_level_str}, " + f"has to be one of: { ', '.join(log_levels.keys()) }" + ) + return _default_log_level + + +def _get_library_name() -> str: + + return __name__.split(".")[0] + + +def _get_library_root_logger() -> logging.Logger: + + return logging.getLogger(_get_library_name()) + + +def _configure_library_root_logger() -> None: + + global _default_handler + + with _lock: + if _default_handler: + # This library has already configured the library root logger. 
+ return + _default_handler = logging.StreamHandler() # Set sys.stderr as stream. + _default_handler.flush = sys.stderr.flush + formatter = logging.Formatter( + "[%(levelname)s|(OpenDelta)%(module)s:%(lineno)d]%(asctime)s >> %(message)s") + _default_handler.setFormatter(formatter) + + # Apply our default configuration to the library root logger. + library_root_logger = _get_library_root_logger() + library_root_logger.addHandler(_default_handler) + library_root_logger.setLevel(_get_default_logging_level()) + + + library_root_logger.propagate = False + + +def _reset_library_root_logger() -> None: + + global _default_handler + + with _lock: + if not _default_handler: + return + + library_root_logger = _get_library_root_logger() + library_root_logger.removeHandler(_default_handler) + library_root_logger.setLevel(logging.NOTSET) + _default_handler = None + + +def get_log_levels_dict(): + return log_levels + + +def get_logger(name: Optional[str] = None) -> logging.Logger: + """ + Return a logger with the specified name. + This function is not supposed to be directly accessed unless you are writing a custom transformers module. + """ + + if name is None: + name = _get_library_name() + + _configure_library_root_logger() + return logging.getLogger(name) + + +def get_verbosity() -> int: + """ + Return the current level for the 🤗 Transformers's root logger as an int. + Returns: + :obj:`int`: The logging level. + + 🤗 Transformers has following logging levels: + - 50: ``transformers.logging.CRITICAL`` or ``transformers.logging.FATAL`` + - 40: ``transformers.logging.ERROR`` + - 30: ``transformers.logging.WARNING`` or ``transformers.logging.WARN`` + - 20: ``transformers.logging.INFO`` + - 10: ``transformers.logging.DEBUG`` + """ + + _configure_library_root_logger() + return _get_library_root_logger().getEffectiveLevel() + + +def set_verbosity(verbosity: int) -> None: + """ + Set the verbosity level for the 🤗 Transformers's root logger. 
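+
+    Example (a minimal sketch; OpenDelta exposes this module as ``opendelta.logging``):
+
+    .. code-block:: python
+
+        from opendelta import logging
+
+        logging.set_verbosity(logging.WARNING)  # only show warnings and errors
+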
+ Args: + verbosity (:obj:`int`): + Logging level, e.g., one of: + - ``transformers.logging.CRITICAL`` or ``transformers.logging.FATAL`` + - ``transformers.logging.ERROR`` + - ``transformers.logging.WARNING`` or ``transformers.logging.WARN`` + - ``transformers.logging.INFO`` + - ``transformers.logging.DEBUG`` + """ + + _configure_library_root_logger() + _get_library_root_logger().setLevel(verbosity) + + +def set_verbosity_info(): + """Set the verbosity to the ``INFO`` level.""" + return set_verbosity(INFO) + + +def set_verbosity_warning(): + """Set the verbosity to the ``WARNING`` level.""" + return set_verbosity(WARNING) + + +def set_verbosity_debug(): + """Set the verbosity to the ``DEBUG`` level.""" + return set_verbosity(DEBUG) + + +def set_verbosity_error(): + """Set the verbosity to the ``ERROR`` level.""" + return set_verbosity(ERROR) + + +def disable_default_handler() -> None: + """Disable the default handler of the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert _default_handler is not None + _get_library_root_logger().removeHandler(_default_handler) + + +def enable_default_handler() -> None: + """Enable the default handler of the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert _default_handler is not None + _get_library_root_logger().addHandler(_default_handler) + + +def add_handler(handler: logging.Handler) -> None: + """adds a handler to the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert handler is not None + _get_library_root_logger().addHandler(handler) + + +def remove_handler(handler: logging.Handler) -> None: + """removes given handler from the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert handler is not None and handler not in _get_library_root_logger().handlers + _get_library_root_logger().removeHandler(handler) + + +def disable_propagation() -> None: + """ + Disable propagation of the library log outputs. Note that log propagation is disabled by default. + """ + + _configure_library_root_logger() + _get_library_root_logger().propagate = False + + +def enable_propagation() -> None: + """ + Enable propagation of the library log outputs. Please disable the HuggingFace Transformers's default handler to + prevent double logging if the root logger has been configured. + """ + + _configure_library_root_logger() + _get_library_root_logger().propagate = True + + +def enable_explicit_format() -> None: + """ + Enable explicit formatting for every HuggingFace Transformers's logger. The explicit formatter is as follows: + ``` + [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE + ``` + All handlers currently bound to the root logger are affected by this method. + """ + handlers = _get_library_root_logger().handlers + + for handler in handlers: + formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s") + handler.setFormatter(formatter) + + +def reset_format() -> None: + """ + Resets the formatting for HuggingFace Transformers's loggers. + All handlers currently bound to the root logger are affected by this method. 
+ """ + handlers = _get_library_root_logger().handlers + + for handler in handlers: + handler.setFormatter(None) + + +def warning_advice(self, *args, **kwargs): + """ + This method is identical to ``logger.warning()``, but if env var TRANSFORMERS_NO_ADVISORY_WARNINGS=1 is set, this + warning will not be printed + """ + no_advisory_warnings = os.getenv("TRANSFORMERS_NO_ADVISORY_WARNINGS", False) + if no_advisory_warnings: + return + self.warning(*args, **kwargs) + + +logging.Logger.warning_advice = warning_advice + +set_verbosity_debug() \ No newline at end of file diff --git a/opendelta/utils/model_md5.py b/opendelta/utils/model_md5.py new file mode 100644 index 0000000..3666295 --- /dev/null +++ b/opendelta/utils/model_md5.py @@ -0,0 +1,36 @@ +import hashlib + +def gen_model_hash(model, with_parameters=True): + r"""Get model hash (structure and parameter) + """ + str_model_structure = str(model).replace("\n","").replace(" ","").replace("\t","").encode('utf-8') + md5 = hashlib.md5(str_model_structure) + + if with_parameters: + md5 = gen_parameter_hash(model.parameters(), md5=md5) + + md5_code = md5.hexdigest() + return md5_code + + + +def gen_parameter_hash(generator, md5=None): + r"""Get parameter hash. From https://zhuanlan.zhihu.com/p/392942816 + + """ + if md5 is None: + md5 = hashlib.md5() + for arg in generator: + x = arg.data + if hasattr(x, "cpu"): + md5.update(x.cpu().numpy().data.tobytes()) + elif hasattr(x, "numpy"): + md5.update(x.numpy().data.tobytes()) + elif hasattr(x, "data"): + md5.update(x.data.tobytes()) + else: + try: + md5.update(x.encode("utf-8")) + except: + md5.update(str(x).encode("utf-8")) + return md5 \ No newline at end of file diff --git a/opendelta/utils/name_based_addressing.py b/opendelta/utils/name_based_addressing.py new file mode 100644 index 0000000..3de2142 --- /dev/null +++ b/opendelta/utils/name_based_addressing.py @@ -0,0 +1,71 @@ +from typing import List +import re +def superstring_in(str_a: str , list_b: List[str]): + r"""check whether there is any string in list b containing str_a. + + Args: + Returns: + """ + return any(str_a in str_b for str_b in list_b) + +def is_child_key(str_a: str , list_b: List[str]): + r"""check whether a string in ``list_b`` is the child key in ``str_a`` + + Args: + Returns: + """ + return any(str_b in str_a and (str_b==str_a or str_a[len(str_b)]==".") for str_b in list_b) + +def endswith_in(str_a: str , list_b: List[str]): + r"""check whether ``str_a`` has a substring that is in list_b. + + Args: + Returns: + """ + return any(str_a.endswith(str_b) and (str_a==str_b or str_a[-len(str_b)-1] == ".") for str_b in list_b) + +def substring_in(str_a: str , list_b: List[str]): + r"""check whether ``str_a`` has a substring that is in list_b. + + Args: + Returns: + """ + token_a = str_a.split(".") + for str_b in list_b: + token_b = str_b.split(".") + for i in range(len(token_a)-len(token_b)+1): + if "".join(token_a[i:i+len(token_b)]) == "".join(token_b): + return True + return False + +def endswith_in_regex(str_a: str , list_b: List[str]): + r"""check whether ``str_a`` has a substring that is in list_b. + + Args: + Returns: + """ + for str_b in list_b: + ret = re.search(str_b, str_a) + if ret is not None: + b = ret.group() + if ret.span()[1] == len(str_a) and (b == str_a or (str_a==b or str_a[-len(b)-1] == ".")): + # the latter is to judge whether it is a full sub key in the str_a, e.g. 
str_a=`attn.c_attn` and list_b=[`attn`] will given False + return True + return False + +def substring_in_regex(str_a: str , list_b: List[str]): + r"""check whether ``str_a`` has a substring that is in list_b. + + Args: + Returns: + """ + for str_b in list_b: + ret = re.search(str_b, str_a) + if ret is not None: + b = ret.group() + if (ret.span()[0] == 0 or str_a[ret.span()[0]-1] == ".") and \ + (ret.span()[1] == len(str_a) or str_a[ret.span()[1]] == "."): #and b == str_a and (str_a==b or str_a[-len(b)-1] == "."): + # the latter is to judge whether it is a full sub key in the str_a, e.g. str_a=`attn.c_attn` and list_b=[`attn`] will given False + return True + return False + \ No newline at end of file diff --git a/opendelta/utils/saving_loading_utils.py b/opendelta/utils/saving_loading_utils.py new file mode 100644 index 0000000..eaeac58 --- /dev/null +++ b/opendelta/utils/saving_loading_utils.py @@ -0,0 +1,396 @@ + +from io import RawIOBase +from tarfile import HeaderError +from typing import Union, Optional, Callable +from opendelta.delta_configs import BaseDeltaConfig +from opendelta.utils.model_md5 import gen_model_hash +import torch +import os +from opendelta import logging +import torch.nn as nn +from transformers.file_utils import ( + WEIGHTS_NAME, + PushToHubMixin, + is_offline_mode, + is_remote_url, + hf_bucket_url, + cached_path, + ) +from transformers.utils.dummy_pt_objects import PreTrainedModel +import hashlib + +logger = logging.get_logger(__name__) + +class SaveLoadMixin(PushToHubMixin): + def add_configs_when_saving(self,): + self.config.backbone_class = self.backbone_model.__class__.__name__ + self.config.backbone_checkpoint_name = os.path.split(self.backbone_model.config._name_or_path.strip("/"))[-1] + self.config.backbone_hash = gen_model_hash(self.backbone_model) + + + + + def save_finetuned( + self, + save_directory: Optional[Union[str, os.PathLike]] = "./output/", + save_config: bool = True, + state_dict: Optional[dict] = None, + save_function: Callable = torch.save, + push_to_hub: bool = False, + **kwargs, + ): + r""" + Save a model and its configuration file to a directory, so that it can be re-loaded using the + :py:meth:`~DeltaBase.from_finetuned` class method. + + Arguments: + save_directory (:obj:`str` or :obj:`os.PathLike`): + Directory to which to save. Will be created if it doesn't exist. + save_config (:obj:`bool`, *optional*, defaults to :obj:`True`): + Whether or not to save the config of the model. Useful when in distributed training like TPUs and need + to call this function on all processes. In this case, set ``save_config=True`` only on the main process + to avoid race conditions. + state_dict (nested dictionary of :obj:`torch.Tensor`): + The state dictionary of the model to save. Will default to ``self.state_dict()``, but can be used to only + save parts of the model or if special precautions need to be taken when recovering the state dictionary + of a model (like when using model parallelism). + save_function (:obj:`Callable`): + The function to use to save the state dictionary. Useful on distributed training like TPUs when one + need to replace ``torch.save`` by another method. + push_to_hub (:obj:`bool`, *optional*, defaults to :obj:`False`): + Whether or not to push your model to the HuggingFace model hub after saving it. + + .. 
tip:: + + Using ``push_to_hub=True`` will synchronize the repository you are pushing to with ``save_directory``, + which requires ``save_directory`` to be a local clone of the repo you are pushing to if it's an existing + folder. Pass along ``temp_dir=True`` to use a temporary directory instead. + + kwargs: + Additional key word arguments passed along to the :py:meth:`~file_utils.PushToHubMixin.push_to_hub` method. + + .. note:: + + You may need to install git-lfs on your machine. + + .. code-block:: bash + + wget -P ~ https://github.com/git-lfs/git-lfs/releases/download/v3.0.2/git-lfs-linux-amd64-v3.0.2.tar.gz + cd ~ + tar -xvzf git-lfs-linux-amd64-v3.0.2.tar.gz + export PATH=~:$PATH + git-lfs install + + """ + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + repo = self._create_or_get_repo(save_directory, **kwargs) + + os.makedirs(save_directory, exist_ok=True) + + # Only save the model itself if we are using distributed training + + model_to_save = self.backbone_model# unwrap_model(self) + + # Save the model + if state_dict is None: + state_dict = model_to_save.state_dict() + + # Save the config + if save_config: + if not hasattr(self, "config"): + self.create_config_from_model() + self.add_configs_when_saving() + self.config.save_finetuned(save_directory) + + # If we save using the predefined names, we can load using `from_pretrained` + output_model_file = os.path.join(save_directory, WEIGHTS_NAME) + save_function(state_dict, output_model_file) + + logger.info(f"Model weights saved in {output_model_file}") + + if push_to_hub: + url = self._push_to_hub(repo, commit_message=commit_message) + logger.info(f"Model pushed to the hub in this commit: {url}") + + @classmethod + def from_finetuned(cls, + finetuned_model_name_or_path: Optional[Union[str, os.PathLike]], + backbone_model: nn.Module, + *model_args, + check_hash: Optional[bool] = True, + **kwargs): + r""" + Instantiate a finetuned delta model from a path. + The backbone_model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated). + To further train the model, you can use the :meth:`freeze_module ` method. + + Parameters: + + finetuned_model_name_or_path (:obj:`str` or :obj:`os.PathLike`, *optional*): + Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under a + user or organization name, like ``dbmdz/bert-base-german-cased``. + - A path to a *directory* containing model weights saved using + :meth:`SaveLoadMixin.save_finetuned`, e.g., ``./my_model_directory/``. + - A path or url to a *tensorflow index checkpoint file* (e.g, ``./tf_model/model.ckpt.index``). In + this case, ``from_tf`` should be set to ``True`` and a configuration object should be provided as + ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a + PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + - A path or url to a model folder containing a *flax checkpoint file* in *.msgpack* format (e.g, + ``./flax_model/`` containing ``flax_model.msgpack``). In this case, ``from_flax`` should be set to + ``True``. + - ``None`` if you are both providing the configuration and state dictionary (resp. 
with keyword + arguments ``config`` and ``state_dict``). + backbone_model (:obj:`torch.nn.Module`): The backbone model to be modified. + model_args (sequence of positional arguments, *optional*): + All remaining positional arguments will be passed to the underlying model's ``__init__`` method. + config (Union[:obj:`BaseDeltaConfig`, :obj:`str`, :obj:`os.PathLike`], *optional*): Can be either: + - an instance of a class derived from :class:`~PretrainedConfig`, + - a string or path valid as input to :py:meth:`~PretrainedConfig.from_pretrained`. + + Configuration for the model to use instead of an automatically loaded configuration. Configuration can + be automatically loaded when: + + - The model is a model provided by the library (loaded with the *model id* string of a pretrained + model). + - The model was saved using :py:meth:`~PreTrainedModel.save_pretrained` and is reloaded by supplying the + save directory. + - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a + configuration JSON file named *config.json* is found in the directory. + state_dict (Dict[:obj:`str`, :obj:`torch.Tensor`], *optional*): + A state dictionary to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own + weights. In this case though, you should check if using :py:meth:`~PreTrainedModel.save_pretrained` and + :py:meth:`~PreTrainedModel.from_pretrained` is not a simpler option. + cache_dir (:obj:`Union[str, os.PathLike]`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (:obj:`bool`, *optional*, defaults to :obj:`False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (:obj:`bool`, *optional*, defaults to :obj:`False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (:obj:`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only(:obj:`bool`, *optional*, defaults to :obj:`False`): + Whether or not to only look at local files (i.e., do not try to download the model). + use_auth_token (:obj:`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token generated + when running ``transformers-cli login`` (stored in ``~/.huggingface``). + revision(:obj:`str`, *optional*, defaults to ``"main"``): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any + identifier allowed by git. + mirror(:obj:`str`, *optional*): + Mirror source to accelerate downloads in China. If you are from China and have an accessibility + problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. + Please refer to the mirror site for more information. + torch_dtype (:obj:`str` or :obj:`torch.dtype`, *optional*): + Override the default :obj:`torch.dtype` and load the model under this dtype. 
If ``"auto"`` is passed the dtype + will be automatically derived from the model's weights. + + .. warning:: + + This feature is inherited from HuggingFace. We do not guarantee its usefulness currently. + One should only disable *_fast_init* to ensure backwards compatibility with `transformers.__version__ < + 4.6.0` for seeded model initialization. This argument will be removed at the next major version. See + `pull request 11471 `_ for more information. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + ``output_attentions=True``). Behaves differently depending on whether a ``config`` is provided or + automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the + underlying model's ``__init__`` method (we assume all relevant updates to the configuration have + already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class + initialization function (:py:meth:`~PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that + corresponds to a configuration attribute will be used to override said attribute with the + supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute + will be passed to the underlying model's ``__init__`` function. + + .. tip:: + Passing ``use_auth_token=True`` is required when you want to use a private model. + + .. code-block:: python + + from transformers import AutoModelForSeq2SeqLM + t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + from opendelta import AutoDeltaModel + delta = AutoDeltaModel.from_finetuned("DeltaHub/lora_t5-base_mrpc", backbone_model=t5) + delta.log() + + + + """ + config = kwargs.pop("config", None) + state_dict = kwargs.pop("state_dict", None) + cache_dir = kwargs.pop("cache_dir", None) + + # ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + # output_loading_info = kwargs.pop("output_loading_info", False) + local_files_only = kwargs.pop("local_files_only", False) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + mirror = kwargs.pop("mirror", None) + from_pipeline = kwargs.pop("_from_pipeline", None) + from_auto_class = kwargs.pop("_from_auto", False) + # _fast_init = kwargs.pop("_fast_init", True) + torch_dtype = kwargs.pop("torch_dtype", None) + # low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False) + + user_agent = {"file_type": "model", "framework": "pytorch", "from_auto_class": from_auto_class} + + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + # Load config if we don't provide a configuration + if not isinstance(config, BaseDeltaConfig): + config_path = config if config is not None else finetuned_model_name_or_path + config, model_kwargs = cls.config_class.from_finetuned( + config_path, + cache_dir=cache_dir, + return_unused_kwargs=True, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + _from_auto=from_auto_class, + _from_pipeline=from_pipeline, + **kwargs, + ) + + else: + model_kwargs = kwargs + + # Load model + if 
finetuned_model_name_or_path is not None: + finetuned_model_name_or_path = str(finetuned_model_name_or_path) + if os.path.isdir(finetuned_model_name_or_path): + if os.path.isfile(os.path.join(finetuned_model_name_or_path, WEIGHTS_NAME)): + # Load from a PyTorch checkpoint + archive_file = os.path.join(finetuned_model_name_or_path, WEIGHTS_NAME) + else: + raise EnvironmentError( + f"Error no file named {WEIGHTS_NAME} found in " + f"directory {finetuned_model_name_or_path}." + ) + elif os.path.isfile(finetuned_model_name_or_path) or is_remote_url(finetuned_model_name_or_path): + archive_file = finetuned_model_name_or_path + else: + archive_file = hf_bucket_url( + finetuned_model_name_or_path, + filename=WEIGHTS_NAME, + revision=revision, + mirror=mirror, + ) + + try: + # Load from URL or cache if already cached #TODO + + resolved_archive_file = cached_path( + archive_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + user_agent=user_agent, + ) + except EnvironmentError as err: + logger.error(err) + msg = ( + f"Can't load weights for '{finetuned_model_name_or_path}'. Make sure that:\n\n" + ) + + if revision is not None: + msg += f"- or '{revision}' is a valid git identifier (branch name, a tag name, or a commit id) that exists for this model name as listed on its model page on 'https://huggingface.co/models'\n\n" + + raise EnvironmentError(msg) + + if resolved_archive_file == archive_file: + logger.info(f"loading weights file {archive_file}") + else: + logger.info(f"loading weights file {archive_file} from cache at {resolved_archive_file}") + else: + resolved_archive_file = None + + # load pt weights early so that we know which dtype to init the model under + + if state_dict is None: + try: + state_dict = torch.load(resolved_archive_file, map_location="cpu") + except Exception as e: + try: + with open(resolved_archive_file) as f: + if f.read().startswith("version"): + raise OSError( + "You seem to have cloned a repository without having git-lfs installed. Please install " + "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder " + "you cloned." + ) + else: + raise ValueError from e + except (UnicodeDecodeError, ValueError): + raise OSError( + f"Unable to load weights from pytorch checkpoint file for '{finetuned_model_name_or_path}' " + f"at '{resolved_archive_file}'. " + "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True." + ) + + # set dtype to instantiate the model under: + # 1. If torch_dtype is not None, we use that dtype + # 2. If torch_dtype is "auto", we auto-detect dtype from the loaded state_dict, by checking its first + # weights entry - we assume all weights are of the same dtype + # we also may have config.torch_dtype available, but we won't rely on it till v5 + dtype_orig = None + if torch_dtype is not None: + if isinstance(torch_dtype, str): + if torch_dtype == "auto": + torch_dtype = next(iter(state_dict.values())).dtype + else: + raise ValueError( + f"`torch_dtype` can be either a `torch.dtype` or `auto`, but received {torch_dtype}" + ) + dtype_orig = cls._set_default_torch_dtype(torch_dtype) + + + # Initialize the model from config and attach the delta model to the backbone_model. + delta_model = cls.from_config(config, backbone_model, *model_args, **model_kwargs, ) + + # load the state_dict into the backbone_model. 
As the delta model's parameters
+        # are the same objects as the deltas in the backbone model (only the reference
+        # names differ), the state_dict is thereby loaded into the delta model as well.
+        delta_model._load_state_dict_into_backbone(backbone_model, state_dict)
+
+        backbone_hash = gen_model_hash(backbone_model)
+        if check_hash and hasattr(config, "backbone_hash") and \
+                  config.backbone_hash is not None and \
+                  config.backbone_hash != backbone_hash:
+            logger.warning("The config records a hash of the backbone model, and it "
+                            "differs from the hash of the backbone model you loaded. This "
+                            "indicates a mismatch between the backbone that the delta checkpoint "
+                            "was trained on and the one you are using. You probably need to "
+                            "retrain the delta model instead of directly running inference with it.")
+
+        # Set model in evaluation mode to deactivate DropOut modules by default
+        backbone_model.eval()
+
+        return delta_model
+
diff --git a/opendelta/utils/signature.py b/opendelta/utils/signature.py
new file mode 100644
index 0000000..b559f92
--- /dev/null
+++ b/opendelta/utils/signature.py
@@ -0,0 +1,54 @@
+import inspect
+from collections import namedtuple
+
+def signature(f):
+    r"""Get the input arguments of function ``f``. A useful gadget
+    when some function slot might be instantiated into multiple functions.
+
+    Args:
+        f (:obj:`function`) : the function to get the input arguments of.
+
+    Returns:
+        namedtuple : ``(args, defaults, varargs, keywords)``, in that order.
+
+    """
+    sig = inspect.signature(f)
+    args = [
+        p.name for p in sig.parameters.values()
+        if p.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD
+    ]
+    varargs = [
+        p.name for p in sig.parameters.values()
+        if p.kind == inspect.Parameter.VAR_POSITIONAL
+    ]
+    varargs = varargs[0] if varargs else None
+    keywords = [
+        p.name for p in sig.parameters.values()
+        if p.kind == inspect.Parameter.VAR_KEYWORD
+    ]
+    keywords = keywords[0] if keywords else None
+    defaults = [
+        p.default for p in sig.parameters.values()
+        if p.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD
+        and p.default is not p.empty
+    ] or None
+    argspec = namedtuple('Signature', ['args', 'defaults',
+        'varargs', 'keywords'])
+    return argspec(args, defaults, varargs, keywords)
+
+def get_arg_names(f):
+    r""" Get a function's argument names, with the ``self`` argument removed.
+    """
+    args = signature(f).args
+    if args[0] == "self":
+        args = args[1:]
+    return args
+
+
+def get_arg_names_inside_func(func):
+    r""" Get a function's argument names from inside the function itself, with the ``self`` argument removed.
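+
+    A minimal sketch of the intended behaviour (the class ``Foo`` below is purely
+    hypothetical and only serves as an illustration):
+
+    .. code-block:: python
+
+        class Foo:
+            def bar(self, x, y=1):
+                return get_arg_names_inside_func(Foo.bar)
+
+        Foo().bar(0)   # -> ('x', 'y')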
+ """ + arg_names = func.__code__.co_varnames[: func.__code__.co_argcount] + if arg_names[0] == "self": + arg_names = arg_names[1:] + return arg_names \ No newline at end of file diff --git a/opendelta/utils/structure_mapping.py b/opendelta/utils/structure_mapping.py new file mode 100644 index 0000000..bb1f1de --- /dev/null +++ b/opendelta/utils/structure_mapping.py @@ -0,0 +1,349 @@ +from typing import OrderedDict +import copy +import opendelta.utils.logging as logging +from opendelta.utils.visualization import Visualization +logger = logging.get_logger(__name__) +t5_mapping = { + "shared": {"__name__":"embeddings"}, + "encoder": {"__name__":"encoder", + "embed_tokens": {"__name__":"embeddings"}, + "block": {"__name__":"block", + "$": {"__name__":"$", + "layer.0": {"__name__":"attn", + "SelfAttention.q": {"__name__":"q"}, + "SelfAttention.k": {"__name__":"k"}, + "SelfAttention.v": {"__name__":"v"}, + "SelfAttention.o": {"__name__":"proj"}, + "SelfAttention.relative_attention_bias": {"__name__":""}, + "layer_norm": {"__name__":"layer_norm"}, + }, + "layer.1": {"__name__":"ff", + "DenseReluDense.wi": {"__name__":"w1"}, + "layer_norm": {"__name__":"layer_norm"}, + "DenseReluDense.wo": {"__name__":"w2"}, + } + } + }, + "final_layer_norm": {"__name__":"layer_norm"}, + }, + "decoder": {"__name__":"decoder", + "embed_tokens": {"__name__":"embeddings"}, + "block": {"__name__":"block", + "$": {"__name__":"$", + "layer.0": {"__name__":"attn", + "SelfAttention.q": {"__name__":"q"}, + "SelfAttention.k": {"__name__":"k"}, + "SelfAttention.v": {"__name__":"v"}, + "SelfAttention.o": {"__name__":"proj"}, + "SelfAttention.relative_attention_bias": {"__name__":""}, + "layer_norm": {"__name__":"layer_norm"}, + }, + "layer.1": {"__name__":"crossattn", + "EncDecAttention.q": {"__name__":"q"}, + "EncDecAttention.k": {"__name__":"k"}, + "EncDecAttention.v": {"__name__":"v"}, + "EncDecAttention.o": {"__name__":"proj"}, + "layer_norm": {"__name__":"layer_norm"}, + }, + "layer.2": {"__name__":"ff", + "DenseReluDense.wi": {"__name__":"w1"}, + "layer_norm": {"__name__":"layer_norm"}, + "DenseReluDense.wo": {"__name__":"w2"}, + } + } + }, + "final_layer_norm": {"__name__":"layer_norm"}, + } +} + + +roberta_mapping = { + "roberta.embeddings.word_embeddings": {"__name__":"embeddings"}, + "roberta.embeddings.position_embeddings": {"__name__":""}, + "roberta.embeddings.token_type_embeddings": {"__name__":""}, + "roberta.embeddings.LayerNorm": {"__name__":""}, + "roberta.encoder": {"__name__":"encoder", + "layer": {"__name__":"block", + "$": {"__name__":"$", + "attention": {"__name__":"attn", + "self.query": {"__name__":"q"}, + "self.key": {"__name__":"k"}, + "self.value": {"__name__":"v"}, + "output.dense": {"__name__":"proj"}, + "output.LayerNorm": {"__name__":"layer_norm"}, + }, + "output": {"__name__":"ff", + "dense": {"__name__":"w2"}, + "LayerNorm": {"__name__":"layer_norm"} + }, + "intermediate.dense": {"__name__":"ff.w1"}, + } + } + }, + "lm_head": {"__name__":"lm_head", + "dense": {"__name__":""}, + "layer_norm": {"__name__":""}, + "decoder": {"__name__":"proj"}, + }, +} + + + +bert_mapping = { + "bert.embeddings.word_embeddings": {"__name__":"embeddings"}, + "bert.embeddings.position_embeddings": {"__name__":""}, + "bert.embeddings.token_type_embeddings": {"__name__":""}, + "bert.embeddings.LayerNorm": {"__name__":""}, + "bert.encoder": {"__name__":"encoder", + "layer": {"__name__":"block", + "$": {"__name__":"$", + "attention": {"__name__":"attn", + "self.query": {"__name__":"q"}, + "self.key": 
{"__name__":"k"}, + "self.value": {"__name__":"v"}, + "output.dense": {"__name__":"proj"}, + "output.LayerNorm": {"__name__":"layer_norm"}, + }, + "output": {"__name__":"ff", + "dense": {"__name__":"w2"}, + "LayerNorm": {"__name__":"layer_norm"} + }, + "intermediate.dense": {"__name__":"ff.w1"}, + } + } + }, + "cls.predictions": {"__name__": "lm_head", + "transform.dense": {"__name__":""}, + "transform.LayerNorm": {"__name__":""}, + "decoder": {"__name__":"proj"}, + } +} + +debertav2_mapping = { + "deberta.embeddings.word_embeddings": {"__name__":"embeddings"}, + "deberta.embeddings.LayerNorm": {"__name__":""}, + "deberta.encoder": {"__name__":"encoder", + "layer": {"__name__":"block", + "$": {"__name__":"$", + "attention": {"__name__":"attn", + "self.query_proj": {"__name__":"q"}, + "self.key_proj": {"__name__":"k"}, + "self.value_proj": {"__name__":"v"}, + "output.dense": {"__name__":"proj"}, + "output.LayerNorm": {"__name__":"layer_norm"}, + }, + "output": {"__name__":"ff", + "dense": {"__name__":"w2"}, + "LayerNorm": {"__name__":"layer_norm"} + }, + "intermediate.dense": {"__name__":"ff.w1"}, + } + }, + "rel_embeddings": {"__name__": ""}, + "LayerNorm": {"__name__": ""}, + "conv": {"__name__": "", + "conv": {"__name__": ""}, + "LayerNorm": {"__name__": ""} + } + }, + "lm_predictions.lm_head": {"__name__":"lm_head", + "dense": {"__name__":""}, + "LayerNorm": {"__name__":""}, + "bias": {"__name__": ""} + }, +} + +gpt2_mapping = { + "transformer.wte": {"__name__":"embeddings"}, + "transformer.wpe": {"__name__":""}, + "transformer.h": {"__name__":"decoder.block", + "$": {"__name__":"$", + "attn": {"__name__":"attn", + "c_attn": {"__name__":"q,k,v"}, + "c_proj": {"__name__":"proj"}, + }, + "ln_1": {"__name__":"attn.layer_norm"}, + "mlp":{ "__name__": "ff", + "c_fc": {"__name__":"w1"}, + "c_proj": {"__name__":"w2"} + }, + "ln_2": {"__name__":"ff.layer_norm"}, + }, + }, + "transformer.ln_f": {"__name__":"decoder.layernorm"}, + "lm_head": {"__name__":"lm_head.proj"}, +} + +distilbert_mapping = { + "distilbert.embeddings.word_embeddings": {"__name__":"embeddings"}, + "distilbert.embeddings.position_embeddings": {"__name__":""}, + "distilbert.embeddings.token_type_embeddings": {"__name__":""}, + "distilbert.embeddings.LayerNorm": {"__name__":""}, + "distilbert.transformer": {"__name__":"encoder", + "layer": {"__name__":"block", + "$": {"__name__":"$", + "attention": {"__name__":"attn", + "q_lin": {"__name__":"q"}, + "k_lin": {"__name__":"k"}, + "v_lin": {"__name__":"v"}, + "out_lin": {"__name__":"proj"}, + }, + "ffn": {"__name__":"ff", + "lin1": {"__name__":"w1"}, + "lin2": {"__name__":"w2"}, + }, + "sa_layer_norm": {"__name__":"attn.layer_norm"}, + "output_layer_norm":{"__name__": "ff.layer_norm"} + } + } + } +} + +def transform(org_key, mapping, strict=True, warning=False, verbose=False): + + chain = org_key.split(".") + query = "" + node = mapping + + new_chain = [] + for elem in chain: + query += elem + if query in node: + node = node[query] + new_elem = node["__name__"] + if new_elem == "": + if strict: + if warning: + print(f"'{org_key}' has no common mapping.") + return + else: + new_chain.append(query) + else: + new_chain.append(new_elem) + query = "" + elif "$" in node: + node = node["$"] + new_chain.append(query) + query = "" + else: + query += "." 
+ if query!="": + if strict: + if warning: + print("A part of the orginial key hasn't been matched!") + return + else: + new_chain.append(query.strip(".")) # tailing query + new_key = ".".join(new_chain) + if verbose: + print(f"{org_key} => {new_key}") + return new_key + + + + +def mapping_for_SequenceClassification(mapping, type): + mapping = copy.deepcopy(mapping) + if type == "roberta": + mapping.pop("lm_head") + mapping['classifier'] = {"__name__":"classifier", + "dense": {"__name__": "dense"}, + "out_proj": {"__name__":"out_proj"} + } + elif type == "bert": + mapping.pop("lm_head") + mapping["classifier"] = {"__name__": "classifier"} + elif type == "deberta": + mapping.pop("lm_predictions.lm_head") + mapping["pooler"] = {"__name__": "classifier"} + mapping["classifier"] = {"__name__": "classifier"} + else: + raise NotImplementedError + return mapping + +def mapping_for_ConditionalGeneration(mapping, type): + mapping = copy.deepcopy(mapping) + if type == "t5": + mapping["lm_head"] = {"__name__":"lm_head.proj"} + else: + raise NotImplementedError + return mapping + +class _LazyLoading(OrderedDict): + def __init__(self, mapping): + self._mapping_string = mapping + self._mapping = {} + + def __getitem__(self, key): + if key not in self._mapping_string: + raise KeyError(key) + value = self._mapping_string[key] + self._mapping[key] = eval(value) + return self._mapping[key] + + def keys(self): + return list(self._mapping_string.keys()) + + def __contains__(self, item): + + return item in self._mapping_string + + +class CommonStructureMap(object): + r""" A lazy loading structure map. + """ + Mappings = _LazyLoading({ + "RobertaForSequenceClassification": """mapping_for_SequenceClassification(roberta_mapping, "roberta")""", + "RobertaForMaskedLM": "roberta_mapping", + "BertForMaskedLM": "bert_mapping", + "BertForSequenceClassification": """mapping_for_SequenceClassification(bert_mapping, "bert")""", + "T5ForConditionalGeneration": """mapping_for_ConditionalGeneration(t5_mapping, "t5")""", + "DebertaV2ForSequenceClassification": """mapping_for_SequenceClassification(debertav2_mapping, "deberta")""" + }) + + SpecialModelInverseMaps = { + } + def __init__(self, mapping): + if not isinstance(mapping, dict): + raise TypeError(f"Initial a {CommonStructureMap.__name__} using a non-dict object. 
Consider using `load` instead.") + self.mapping = mapping + + + @classmethod + def load(cls, backbone_model, strict=True, warining=False, visualize=True): + r"""Doc + """ + backbone_class = type(backbone_model).__name__ + if backbone_class not in cls.Mappings: + raise KeyError(backbone_class) + mapping = cls.Mappings[backbone_class] + if visualize: + logger.info("Since you are using the common structure mapping, draw the transformed parameter structure for checking.") + vis = Visualization(backbone_model) + vis.structure_graph(common_structure=True, mapping=mapping) + return cls(mapping) + + def __repr__(self,): + return self.mapping + + + def transform(self, org_key, strict=True, warning=False): + return transform(org_key, self.mapping, strict, warning) + + + +if __name__ == "__main__": + from openprompt.plms import load_plm + import argparse + parser = argparse.ArgumentParser("") + parser.add_argument("--model", type=str, default='t5-lm', help="We test both t5 and t5-lm in this scripts, the corresponding tokenizerwrapper will be automatically loaded.") + parser.add_argument("--model_name_or_path", default="t5-base-lm-adapt") + parser.add_argument("--cache_base", default='/home/hushengding/plm_cache/') + parser.add_argument("--keep_non_params", action="store_true") + parser.add_argument("--expand_params", action="store_true") + args = parser.parse_args() + plm, tokenizer, model_config, WrapperClass = load_plm(args.model, args.cache_base+args.model_name_or_path) + + for name, _ in plm.named_modules(): + transform(name, t5_mapping, strict=True, warning=False) + \ No newline at end of file diff --git a/opendelta/utils/visualization.py b/opendelta/utils/visualization.py new file mode 100644 index 0000000..bbe5b12 --- /dev/null +++ b/opendelta/utils/visualization.py @@ -0,0 +1,438 @@ +from typing import List +from rich.tree import Tree as RichTree +from rich import print as richprint +import torch +import torch.nn as nn +import re +from collections import OrderedDict +import opendelta.utils.logging as logging +logger = logging.get_logger(__name__) +class ModuleTree(RichTree): + def __init__( + self, + module_name=None, + info=None, + is_param_node=False, + type_color="green", + param_color="red", + main_color="white", + style = "tree", + guide_style = "tree.line", + expanded=True, + highlight=False, + ): + self.module_name = module_name + self.info = info + self.is_param_node = is_param_node + self.type_color = type_color + self.param_color = param_color + self.main_color = main_color + label = self.set_label() + super().__init__(label,style=style,guide_style=guide_style,expanded=expanded,highlight=highlight) + + + def add( + self, + module_name=None, + info=None, + is_param_node=False, + type_color="green", + param_color="red", + main_color="white", + style=None, + guide_style=None, + expanded=True, + highlight=False, + ): + node = ModuleTree( + module_name, + info, + is_param_node, + type_color, + param_color, + main_color, + style=self.style if style is None else style, + guide_style=self.guide_style if guide_style is None else guide_style, + expanded=expanded, + highlight=self.highlight if highlight is None else highlight, + ) + self.children.append(node) + return node + + def set_label(self): + if self.module_name is not None: + label = f"[{self.main_color}]{self.module_name}" + else: + label = "" + if self.info is not None: + if not self.is_param_node: + label += f" [{self.type_color}]({self.info})" + else: + label += f" [{self.param_color}]{self.info}" + self.label = label + return 
label
+
+
+class Visualization(object):
+    r"""
+    A visualization tool for *big* pretrained models.
+
+    - Better representation of repeated blocks
+    - Clearer parameter positions
+    - Visible parameter states
+
+    Args:
+        plm (:obj:`torch.nn.Module`): The pretrained model; in fact, any PyTorch module can be visualized.
+
+    """
+    def __init__(self, plm: nn.Module):
+
+        self.plm = plm
+        self.type_color = "green"
+        self.param_color = "cyan"
+        self.duplicate_color = "red"
+        self.normal_color = "white"
+        self.virtual_color = "orange"
+        self.not_common_color = "bright_black"
+        self.no_grad_color = "rgb(0,70,100)"
+        self.delta_color = "rgb(175,0,255)"
+
+    def check_mode(self):
+        if self.keep_non_params and self.common_structure:
+            raise RuntimeError("keep_non_params can't be used with common_structure. The common structure only contains parameter nodes.")
+        if self.common_structure:
+            if self.mapping is None:
+                raise RuntimeError("Mapping hasn't been given.")
+
+    def structure_graph(self,
+                        rootname="root",
+                        expand_params=False,
+                        keep_non_params=False,
+                        common_structure=False,
+                        mapping=None,
+                        only_common=False,
+                        printTree=True,
+                        ):
+        r"""Draw the structure graph in the command line.
+
+        Args:
+            rootname (:obj:`str`): The root node's name.
+            keep_non_params (:obj:`bool`): Display the modules that do not have parameters, such as nn.Dropout.
+            expand_params (:obj:`bool`): Display parameter information (shape, etc.) on separate lines.
+            common_structure (:obj:`bool`): Whether to convert the structure into the common structure defined in structure_mapping.py. Modules outside the common structure are displayed in grey.
+            only_common (:obj:`bool`): Whether to ignore the modules that are not in the common structure. This results in a more compact view. Defaults to False.
+            mapping (:obj:`dict`): The structure mapping. Must be provided if common_structure=True.
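+
+        A minimal usage sketch (the backbone below is only an example; any
+        :obj:`nn.Module` can be visualized):
+
+        .. code-block:: python
+
+            from transformers import BertModel
+            from opendelta.utils.visualization import Visualization
+
+            backbone = BertModel.from_pretrained("bert-base-uncased")
+            vis = Visualization(backbone)
+            vis.structure_graph(rootname="bert", expand_params=False)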
+ """ + + self.keep_non_params = keep_non_params + self.expand_params = expand_params + self.rootname = rootname + self.only_common = only_common + self.common_structure = common_structure + self.mapping = mapping + self.check_mode() + # root_tree = self.build_tree(rootname) + self.root_tree = ModuleTree(self.rootname) + if common_structure: + self.build_common_tree(self.plm, mapping, self.root_tree) + else: + self.build_tree(self.plm, self.root_tree) + self.prune_tree(self.root_tree) + if not self.expand_params: + self.fold_param_node(self.root_tree) + if printTree: + richprint(self.root_tree) + return self.root_tree + + + + + def is_leaf_module(self, module): + r"""[NODOC] Whether the module is a leaf module + """ + return len([n for n,_ in module.named_children()]) == 0 + + def build_tree(self, module:nn.Module, tree:ModuleTree=None): + r"""[NODOC] build the originial tree structure + """ + if self.is_leaf_module(module): + return + else: + for n,m in module.named_children(): + type_info = re.search(r'(?<=\').*(?=\')', str(type(m))).group() + type_info = type_info.split(".")[-1] + newnode = tree.add(n, info=type_info, type_color=self.type_color) + self.add_param_info_node(m, newnode) + self.build_tree(module=m, tree=newnode) + + def has_parameter(self, module): + return len([p for p in module.parameters()])>0 + + + def build_common_tree(self, module:nn.Module, mapping, tree:ModuleTree=None, query="", key_to_root=""): + r""" (Unstable) build the common tree structure + """ + if self.is_leaf_module(module): + if len(query)>0: # the field is not in mapping + if self.has_parameter(module): + # from IPython import embed + # embed(header = "in leaf") + logger.warning(f"Parameter node {query} not found under tree {tree.module_name} and module {module}. Is your mapping correct?") # WARNING + return + else: + for n,m in module.named_children(): + new_query = query+n + type_info = re.search(r'(?<=\').*(?=\')', str(type(m))).group() + type_info = type_info.split(".")[-1] + if new_query in mapping or "$" in mapping: + # print("query",new_query) + # from IPython import embed + # embed() + if new_query in mapping: + new_mapping = mapping[new_query] + name = new_mapping["__name__"] + if len(name.split(".")) > 1: # new key contains a hierarchy , then unfold the hierarchy. + # insert virtual node + hierachical_name = name.split(".") + temp_tree = self.find_or_insert(tree, hierachical_name) + newnode = temp_tree.add(hierachical_name[-1], info=type_info, type_color=self.type_color) + elif name=="": # the key not in a predefined common structure + if self.only_common: + continue + else: # add the originial name into the tree + newnode = tree.add(new_query, info=type_info, main_color=self.not_common_color, type_color=self.not_common_color) + else: # a single new key + newnode = self.find_not_insert(tree, [name,""]) # try to find the node + if newnode is not None: + newnode.info = type_info + newnode.type_color = self.type_color + newnode.set_label() + else: + newnode = tree.add(name, info=type_info, type_color=self.type_color) + elif "$" in mapping: # match any thing in the field. 
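+                    # "$" acts as a wildcard for repeated children such as numeric block
+                    # indices ("0", "1", ...): any child name falls through this branch and
+                    # keeps its original name in the common tree.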
+ new_mapping = mapping["$"] + newnode = tree.add(n, info=type_info, type_color=self.type_color) + self.add_param_info_node(m, newnode) + self.build_common_tree(module=m, tree=newnode, mapping=new_mapping, key_to_root=key_to_root+"."+new_query) + else: + # try to find from root + # trsf_key = transform(key_to_root.strip("."), self.mapping) + # parent_node = self.find_not_insert(self.root_tree, trsf_key.split(".")+[""]) + # if parent_node is not None: + # new_mapping = mapping[new_query] + # newnode = parent_node.add(name, info=type_info, type_color=self.type_color) + # self.build_common_tree(module=m, tree=parent_node, mapping ) + # print("notin query",new_query) + # if new_query == "dense": + # from IPython import embed + # embed() + # print(f"::{query},,{new_query}, {list(mapping.keys())}") + new_query += "." + self.build_common_tree(module=m, tree=tree, mapping=mapping, query=new_query, key_to_root=key_to_root) + + + + def find_or_insert(self, tree:ModuleTree, hierachical_name:List[str] ): + r"""[NODOC] Find the node, if not find, insert a virtual node + """ + if len(hierachical_name)==1: + return tree + names = [x.module_name for x in tree.children] + if hierachical_name[0] not in names: + new_node = tree.add(hierachical_name[0], info="Virtual", type_color=self.virtual_color) + else: + for x in tree.children: + if x.module_name == hierachical_name[0]: + new_node = x + break + return self.find_or_insert(new_node, hierachical_name=hierachical_name[1:]) + + def find_not_insert(self, tree:ModuleTree, hierachical_name:List[str] ): + r"""[NODOC] Find the node but not insert + """ + if len(hierachical_name)==1: + return tree + names = [x.module_name for x in tree.children] + if hierachical_name[0] not in names: + return None + else: + for x in tree.children: + if x.module_name == hierachical_name[0]: + new_node = x + break + return self.find_not_insert(new_node, hierachical_name=hierachical_name[1:]) + + + + def fold_param_node(self, t: ModuleTree, p:ModuleTree=None): + r"""[NODOC] place the parameters' infomation node right after the module that contains the parameters. + E.g. w1 (Linear) + -- weight: [32128, 1024] + => + w1 (Linear) weight: [32128, 1024] + + """ + if hasattr(t,"is_param_node") and t.is_param_node: + p.label += t.label + return True # indicate whether should be removed + elif len(t.children) == 0: + if self.keep_non_params: + return False + else: + return True + else: + rm_idx = [] + for idx, c in enumerate(t.children): + if self.fold_param_node(t=c, p=t): + rm_idx.append(idx) + t.children = [t.children[i] for i in range(len(t.children)) if i not in rm_idx] + return False + + def prune_tree(self, t: ModuleTree): + r"""[NODOC] Calculate the _finger_print of a module as the _finger_print of all child node plus the _finger_print of itself. + The leaf node will have the _finger_print == label. + Merge the different node that as the same _finger_print into a single node. 
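+        For example, sibling transformer blocks whose subtrees are structurally identical
+        (say blocks ``1`` ... ``23`` of a T5 encoder) are collapsed into a single child
+        whose label reads roughly ``1-23 (T5Block)``; see :meth:`extract_common_and_join`
+        and :meth:`neat_expr`.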
+ """ + if len(t.children) == 0: + setattr(t, "_finger_print", t.label) + return + + for idx, sub_tree in enumerate(t.children): + self.prune_tree(sub_tree) + + t_finger_print = t.label +"::"+";".join([x._finger_print for x in t.children]) + setattr(t, "_finger_print", t_finger_print) + + nohead_finger_print_dict = OrderedDict() + for child_id, sub_tree in enumerate(t.children): + fname_list = sub_tree._finger_print.split("::") + if len(fname_list)==1: + fname = fname_list[0] + else: + fname = "::".join(fname_list[1:]) + if fname not in nohead_finger_print_dict: + nohead_finger_print_dict[fname] = [child_id] + else: + nohead_finger_print_dict[fname].append(child_id) + + new_childrens = [] + for groupname in nohead_finger_print_dict: + representative_id = nohead_finger_print_dict[groupname][0] + representative = t.children[representative_id] + group_node = [t.children[idx] for idx in nohead_finger_print_dict[groupname]] + + representative = self.extract_common_and_join(group_node) + new_childrens.append(representative) + t.children = new_childrens + + + def extract_common_and_join(self, l:List[ModuleTree]): + r"""[NODOC] Some modules that have the same info (e.g., are all "Linear") have different names (e.g., w1,w2) + Merge them. + E.g. tree1.module_name = "w1", tree1.info = "Linear"; tree2.module_name = "w1", tree2.info = "Linear" + -> representive.module_name = "w1,w2", representive.info = "Linear" + """ + representative = l[0] + if len(l)==1: + return representative + name_list = [x.module_name for x in l] + info_list = [x.info for x in l] + type_hint_dict = OrderedDict() + for x, y in zip(name_list, info_list): + if y not in type_hint_dict: + type_hint_dict[y] = [x] + else: + type_hint_dict[y].append(x) + + s = "" + names = "" + typeinfos = "" + for t in type_hint_dict: + group_components = type_hint_dict[t] + group_components = self.neat_expr(group_components) + names += group_components+"," + typeinfos += t+"," + s += f"[{self.duplicate_color}]{group_components}[{self.type_color}]({t})" + s += f"," + names = names[:-1] + s = s[:-1] + typeinfos = typeinfos[:-1] + representative.module_name = names + representative.type_info = typeinfos + representative.label = s + return representative + + def neat_expr(self, l:List[str]): + r"""[NODOC] A small tool function to arrange the consecutive number into interval display. + E.g., ["1","2","3","5","6","9","10","11","12"] -> ["1-3","5-6","9-12"] + """ + try: + s = self.ranges([int(x.strip()) for x in l]) + s = [str(x)+"-"+str(y) for x,y in s] + return ",".join(s) + except: + return ",".join(l) + + def ranges(self, nums:List[int]): + r"""[NODOC] A small tool function to arrange the consecutive number into interval display. + E.g., [1,2,3,5,6,9,10,11,12] -> [[1,3],[5,6],[9,12]] + """ + nums = sorted(set(nums)) + gaps = [[s, e] for s, e in zip(nums, nums[1:]) if s+1 < e] + edges = iter(nums[:1] + sum(gaps, []) + nums[-1:]) + return list(zip(edges, edges)) + + def add_param_info_node(self, m:nn.Module, tree:ModuleTree, record_grad_state=True, record_delta=True): + r"""[NODOC] Add parameter infomation of the module. The parameters that are not inside a module (i.e., created using nn.Parameter) will be added in this function. 
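+
+        A small sketch of how the node color is chosen (the module below is hypothetical):
+
+        .. code-block:: python
+
+            layer = nn.Linear(4, 4)
+            layer.weight.requires_grad = False   # rendered in the no-grad color
+            layer.bias._is_delta = True          # rendered in the delta color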
+ """ + known_module = [n for n,c in m.named_children()] + for n,p in m.named_parameters(): + if n.split(".")[0] not in known_module: + if len(n.split(".")) > 1: raise RuntimeError(f"The name field {n} should be a parameter since it doesn't appear in named_children, but it contains '.'") + info = "{}:{}".format(n, list(p.shape)) + + if record_grad_state: + if not p.requires_grad: + color = self.no_grad_color + else: + color = self.param_color + else: + color = self.param_color + + if record_delta: + if hasattr(p, "_is_delta") and getattr(p, "_is_delta"): + color = self.delta_color + + tree.add(info=info, is_param_node=True, param_color=color) + + + + + + + +if __name__=="__main__": + # example command line: + # 1. python opendelta/utils/visualization.py --model t5-lm --model_name_or_path t5-large-lm-adapt --common_structure --only_common + # 2. python opendelta/utils/visualization.py --model roberta --model_name_or_path roberta-large --common_structure + # 3. python opendelta/utils/visualization.py --model gpt2 --model_name_or_path gpt2-medium --keep_non_params --expand_params + from openprompt.plms import load_plm + import argparse + parser = argparse.ArgumentParser("") + parser.add_argument("--model", type=str, default='t5-lm', help="We test both t5 and t5-lm in this scripts, the corresponding tokenizerwrapper will be automatically loaded.") + parser.add_argument("--model_name_or_path", default="t5-large-lm-adapt") + parser.add_argument("--cache_base", default='/home/hushengding/plm_cache/') + parser.add_argument("--keep_non_params", action="store_true", help="Display the modules that does not have parameters, such as nn.Dropout") + parser.add_argument("--expand_params", action="store_true", help="Display parameter infomation (shape, etc) in seperate lines. ") + parser.add_argument("--common_structure", action="store_true", help="Whether convert the structure into a common structure defined in structure_mapping.py. The not common structure will be displayed in grey." ) + parser.add_argument("--only_common", action="store_true", help="Whether ignore the modules that are not in common structure. This will result in a more compact view. 
Default to False") + args = parser.parse_args() + plm, tokenizer, model_config, WrapperClass = load_plm(args.model, args.cache_base+args.model_name_or_path) + print("Model Loaded!") + if args.common_structure: + from opendelta.utils.structure_mapping import Mappings + mapping = Mappings[args.model] + else: + mapping = None + visobj = Visualization(plm) + visobj.structure_graph(rootname=args.model_name_or_path, keep_non_params=args.keep_non_params, expand_params=args.expand_params, common_structure=args.common_structure, only_common=args.only_common, mapping=mapping) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e5fa365 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +torch>=1.9.0 +transformers==4.10.0 +datasets==1.17.0 +sentencepiece==0.1.96 +tqdm==4.62.2 +openprompt +loralib +decorator +rich +web.py +gitpython diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..a10b156 --- /dev/null +++ b/setup.py @@ -0,0 +1,42 @@ + +import setuptools +import os + +def get_requirements(path): + ret = [] + with open(os.path.join(path, "requirements.txt"), encoding="utf-8") as freq: + for line in freq.readlines(): + ret.append( line.strip() ) + return ret + + +path = os.path.dirname(os.path.abspath(__file__)) +requires = get_requirements(path) +print(requires) + +with open('README.md', 'r') as f: + setuptools.setup( + name = 'opendelta', + version = '0.0.1', + description = "An open source framework for delta learning (parameter efficient learning).", + long_description=open("README.md", "r", encoding="utf-8").read(), + long_description_content_type="text/markdown", + author = '', + author_email = 'shengdinghu@gmail.com', + license="Apache", + url="https://github.com/thunlp/OpenDelta", + keywords = ['PLM', 'Parameter-efficient-Learning', 'AI', 'NLP'], + python_requires=">=3.8.0", + install_requires=requires, + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + ] + ) \ No newline at end of file