commit ccd9d2fe2d614fae2da127eee07751ff6593bf34 Author: Keerthana P G Date: Fri Dec 9 11:58:47 2022 -0800 add robotics transformer diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ac43932 --- /dev/null +++ b/.gitignore @@ -0,0 +1,25 @@ +# Compiled python modules. +*.pyc + +# Byte-compiled +_pycache__/ +.cache/ + +# Poetry, setuptools, PyPI distribution artifacts. +/*.egg-info +.eggs/ +build/ +dist/ +poetry.lock + +# Tests +.pytest_cache/ + +# Type checking +.pytype/ + +# Other +*.DS_Store + +# PyCharm +.idea diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..467c1f9 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,447 @@ +# This Pylint rcfile contains a best-effort configuration to uphold the +# best-practices and style described in the Google Python style guide: +# https://google.github.io/styleguide/pyguide.html +# +# Its canonical open-source location is: +# https://google.github.io/styleguide/pylintrc + +[MASTER] + +# Add files or directories to the ignore list. They should be base names, not +# paths. +ignore=third_party + +# Add files or directories matching the regex patterns to the ignore list. The +# regex matches against base names, not paths. +ignore-patterns= + +# Pickle collected data for later comparisons. +persistent=no + +# List of plugins (as comma separated values of python modules names) to load, +# usually to register additional checkers. +load-plugins= + +# Use multiple processes to speed up Pylint. +jobs=4 + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. +extension-pkg-allow-list= + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED +confidence= + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +#enable= + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once).You can also use "--disable=all" to +# disable everything first and then reenable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". 
If you want to run only the classes checker, but have +# no Warning level messages displayed, use"--disable=all --enable=classes +# --disable=W" +disable=abstract-method, + apply-builtin, + arguments-differ, + attribute-defined-outside-init, + backtick, + bad-option-value, + basestring-builtin, + buffer-builtin, + c-extension-no-member, + consider-using-enumerate, + cmp-builtin, + cmp-method, + coerce-builtin, + coerce-method, + delslice-method, + div-method, + duplicate-code, + eq-without-hash, + execfile-builtin, + file-builtin, + filter-builtin-not-iterating, + fixme, + getslice-method, + global-statement, + hex-method, + idiv-method, + implicit-str-concat-in-sequence, + import-error, + import-self, + import-star-module-level, + inconsistent-return-statements, + input-builtin, + intern-builtin, + invalid-str-codec, + locally-disabled, + long-builtin, + long-suffix, + map-builtin-not-iterating, + misplaced-comparison-constant, + missing-function-docstring, + metaclass-assignment, + next-method-called, + next-method-defined, + no-absolute-import, + no-else-break, + no-else-continue, + no-else-raise, + no-else-return, + no-init, # added + no-member, + no-name-in-module, + no-self-use, + nonzero-method, + oct-method, + old-division, + old-ne-operator, + old-octal-literal, + old-raise-syntax, + parameter-unpacking, + print-statement, + raising-string, + range-builtin-not-iterating, + raw_input-builtin, + rdiv-method, + reduce-builtin, + relative-import, + reload-builtin, + round-builtin, + setslice-method, + signature-differs, + standarderror-builtin, + suppressed-message, + sys-max-int, + too-few-public-methods, + too-many-ancestors, + too-many-arguments, + too-many-boolean-expressions, + too-many-branches, + too-many-instance-attributes, + too-many-locals, + too-many-nested-blocks, + too-many-public-methods, + too-many-return-statements, + too-many-statements, + trailing-newlines, + unichr-builtin, + unicode-builtin, + unnecessary-pass, + unpacking-in-except, + useless-else-on-loop, + useless-object-inheritance, + useless-suppression, + using-cmp-argument, + wrong-import-order, + xrange-builtin, + zip-builtin-not-iterating, + + +[REPORTS] + +# Set the output format. Available formats are text, parseable, colorized, msvs +# (visual studio) and html. You can also give a reporter class, eg +# mypackage.mymodule.MyReporterClass. +output-format=text + +# Put messages in a separate file for each module / package specified on the +# command line instead of printing them on stdout. Reports (if any) will be +# written in a file name "pylint_global.[txt|html]". This option is deprecated +# and it will be removed in Pylint 2.0. +files-output=no + +# Tells whether to display a full report or only the messages +reports=no + +# Python expression which should return a note less than 10 (10 is the highest +# note). You have access to the variables errors warning, statement which +# respectively contain the number of errors / warnings messages and the total +# number of statements analyzed. This is used by the global evaluation report +# (RP0004). +evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. 
See doc for all details +#msg-template= + + +[BASIC] + +# Good variable names which should always be accepted, separated by a comma +good-names=main,_ + +# Bad variable names which should always be refused, separated by a comma +bad-names= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Include a hint for the correct naming format with invalid-name +include-naming-hint=no + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl + +# Regular expression matching correct function names +function-rgx=^(?:(?P<exempt>setUp|tearDown|setUpModule|tearDownModule)|(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$ + +# Regular expression matching correct variable names +variable-rgx=^[a-z][a-z0-9_]*$ + +# Regular expression matching correct constant names +const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ + +# Regular expression matching correct attribute names +attr-rgx=^_{0,2}[a-z][a-z0-9_]*$ + +# Regular expression matching correct argument names +argument-rgx=^[a-z][a-z0-9_]*$ + +# Regular expression matching correct class attribute names +class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ + +# Regular expression matching correct inline iteration names +inlinevar-rgx=^[a-z][a-z0-9_]*$ + +# Regular expression matching correct class names +class-rgx=^_?[A-Z][a-zA-Z0-9]*$ + +# Regular expression matching correct module names +module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$ + +# Regular expression matching correct method names +method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$ + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$ + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=10 + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager + +# Tells whether missing members accessed in mixin class should be ignored. A +# mixin class is detected if its name ends with "mixin" (case insensitive). +ignore-mixin-members=yes + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis. It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed.
Python regular +# expressions are accepted. +generated-members= + + +[FORMAT] + +# Maximum number of characters on a single line. +max-line-length=80 + +# TODO(https://github.com/PyCQA/pylint/issues/3352): Direct pylint to exempt +# lines made too long by directives to pytype. + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=(?x)( + ^\s*(\#\ )?<?https?://\S+>?$| + ^\s*(from\s+\S+\s+)?import\s+.+$) + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=yes + +# List of optional constructs for which whitespace checking is disabled. `dict- +# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. +# `trailing-comma` allows a space between comma and closing bracket: (a, ). +# `empty-line` allows space-only lines. +no-space-check= + +# Maximum number of lines in a module +max-module-lines=99999 + +# String used as indentation unit. The internal Google style guide mandates 2 +# spaces. Google's externally-published style guide says 4, consistent with +# PEP 8. Here, we use 2 spaces, for conformity with many open-sourced Google +# projects (like TensorFlow). +indent-string='  ' + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=TODO + + +[STRING] + +# This flag controls whether inconsistent-quotes generates a warning when the +# character used as a quote delimiter is used inconsistently within a module. +check-quote-consistency=yes + + +[VARIABLES] + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# A regular expression matching the name of dummy variables (i.e. expectedly +# not used). +dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_) + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid to define new builtins when possible. +additional-builtins= + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_,_cb + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools + + +[LOGGING] + +# Logging modules to check that the string format arguments are in logging +# function parameter format +logging-modules=logging,absl.logging,tensorflow.io.logging + + +[SIMILARITIES] + +# Minimum lines number of a similarity. +min-similarity-lines=4 + +# Ignore comments when computing similarities. +ignore-comments=yes + +# Ignore docstrings when computing similarities. +ignore-docstrings=yes + +# Ignore imports when computing similarities. +ignore-imports=no + + +[SPELLING] + +# Spelling dictionary name. Available dictionaries: none. To make it working +# install python-enchant package. +spelling-dict= + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to indicated private dictionary in +# --spelling-private-dict-file option instead of raising a message.
+spelling-store-unknown-words=no + + +[IMPORTS] + +# Deprecated modules which should not be used, separated by a comma +deprecated-modules=regsub, + TERMIOS, + Bastion, + rexec, + sets + +# Create a graph of every (i.e. internal and external) dependencies in the +# given file (report RP0402 must not be disabled) +import-graph= + +# Create a graph of external dependencies in the given file (report RP0402 must +# not be disabled) +ext-import-graph= + +# Create a graph of internal dependencies in the given file (report RP0402 must +# not be disabled) +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant, absl + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + + +[CLASSES] + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict, + _fields, + _replace, + _source, + _make + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls, + class_ + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=mcs + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when being caught. Defaults to +# "Exception" +overgeneral-exceptions=StandardError, + Exception, + BaseException diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..2130cbf --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,30 @@ +{ + "files.insertFinalNewline": true, + "files.trimFinalNewlines": true, + "files.trimTrailingWhitespace": true, + "files.associations": { + ".pylintrc": "ini" + }, + "python.testing.unittestEnabled": false, + "python.testing.nosetestsEnabled": false, + "python.testing.pytestEnabled": true, + "python.linting.pylintUseMinimalCheckers": false, + "[python]": { + "editor.rulers": [ + 80 + ], + "editor.tabSize": 2, + "editor.formatOnSave": true, + "editor.detectIndentation": false + }, + "python.formatting.provider": "black", + "python.formatting.blackPath": "pyink", + "files.watcherExclude": { + "**/.git/**": true + }, + "files.exclude": { + "**/__pycache__": true, + "**/.pytest_cache": true, + "**/*.egg-info": true + } +} diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..711e72d --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,31 @@ +# Changelog + + + +## [Unreleased] + +## [0.1.0] - 2022-01-01 + +* Initial release + +[Unreleased]: https://github.com/google-research/robotics_transformer/compare/v0.1.0...HEAD +[0.1.0]: https://github.com/google-research/robotics_transformer/releases/tag/v0.1.0 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..e5c3b28 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,29 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. There are +just a few small guidelines you need to follow. + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement (CLA). 
You (or your employer) retain the copyright to your +contribution; this simply gives us permission to use and redistribute your +contributions as part of the project. Head over to + to see your current agreements on file or +to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code Reviews + +All submissions, including submissions by project members, require review. We +use GitHub pull requests for this purpose. Consult +[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more +information on using pull requests. + +## Community Guidelines + +This project follows +[Google's Open Source Community Guidelines](https://opensource.google/conduct/). diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7a4a3ea --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/METADATA b/METADATA new file mode 100644 index 0000000..2571adf --- /dev/null +++ b/METADATA @@ -0,0 +1,134 @@ +# go/google3metadata +# proto-file: devtools/metadata/metadata.proto +# proto-message: MetaData + +name: "robotics_transformer" +description: "code and utilities to build and run RT-1 robotics transformer" + +third_party { + url { + type: HOMEPAGE + value: "http://go/robotics_transformer" + } + url { + type: PIPER + value: "http://google3/third_party/py/robotics_transformer" + } +} + +presubmit: { + review_notify: "robotics_transformer-automated+reviews" + + check_tests: { + failure_status: ERROR + project: "robotics_transformer" + } + + # Checks that files in the changelist do not contain tab characters. + check_tabs: { + failure_status: ERROR + } + + check_trailing_whitespace: { + failure_status: ERROR + } + + # Presubmit applied during submit + + check_lint: { + action: SUBMIT # Do not activate by default to not block TAP. + failure_status: ERROR + } + + # Ensures that the string "do not submit" (in all caps) is not present. + check_do_not_submit: { + action: SUBMIT + } +} + +# Register the copy.bara.sky +exported: { + copybara: { + config_path: "//depot/google3/third_party/py/robotics_transformer/copy.bara.sky" + } + path_expression: "//depot/google3/third_party/py/robotics_transformer/..." + remote_location: "https://github.com/google-research/robotics_transformer" + reason: OPEN_SOURCE + description: "Open source robotics_transformer" + # request_url: "https://launch.corp.google.com/launch/4225970" + owning_team_email: "robotics_transformer-automated@google.com" +} + +# Copybara presubmit +# presubmit: { +# path_expression: "//depot/google3/third_party/py/robotics_transformer/..." +# # Do not trigger copybara for the following files +# path_expression_exclusion: "//depot/.../METADATA" +# path_expression_exclusion: "//depot/.../OWNERS" +# path_expression_exclusion: "//depot/.../BUILD" +# path_expression_exclusion: "//depot/.../*.bzl" +# path_expression_exclusion: "//depot/.../google/..." + +# # Ensure that changes contain public notes for git commit messages. 
+# check_description: { +# base: { +# id: "CopybaraDescription" +# disable_tags: "GIT_ORIGIN_REV_ID" +# disable_tags: "SKIP_COPYBARA" +# } + +# required_regexp: +# "(" +# "(^|\\n)\\s*BEGIN_PUBLIC\\s*?\\n" +# "(.*\\n)*" +# "\\s*\\S+.*(\\n.*)*\\n" +# "\\s*END_PUBLIC\\s*?\\n" +# "|" +# "(^|\\n)\\s*PUBLIC:(?: )*\\S+" +# ")" + +# failure_message: +# "\n" +# "By running presubmit, this cl will be exported as PR on github. " +# "Please add a public commit message to the cl description:\n" +# "\n" +# "PUBLIC: my public commit msg\n" +# "\n" +# "OR\n" +# "\n" +# "BEGIN_PUBLIC\n" +# "my public\n" +# "commit msg\n" +# "END_PUBLIC\n" +# "\n" +# "If you're certain your change does not produce public changes, the\n" +# "message can say 'Internal'.\n" +# failure_status: WARNING +# required_for_cleanup: false +# } + +# check_presubmit_service: { +# base: { id: "Copybara-Review" disable_tags: "GIT_ORIGIN_REV_ID" } +# action: REVIEW +# streaming: true +# timeout: 60 +# failure_status: WARNING +# execution_mode: SECONDARY_EXECUTION +# include_all_opened_files: true +# include_deleted_files: true +# address: "blade:copybara-streaming-presubmit-service-prod" +# options: "depot_path=//depot/google3/third_party/py/robotics_transformer/copy.bara.sky;workflow=piper_to_github_presubmit;blocking=false" +# } +# check_presubmit_service: { +# base: { id: "Copybara-Submit" disable_tags: "GIT_ORIGIN_REV_ID" } +# action: SUBMIT +# streaming: true +# timeout: 600 +# failure_status: ERROR +# execution_mode: SECONDARY_EXECUTION +# include_all_opened_files: true +# include_deleted_files: true +# address: "blade:copybara-streaming-presubmit-service-prod" +# options: "depot_path=//depot/google3/third_party/py/robotics_transformer/copy.bara.sky;workflow=piper_to_github_presubmit;blocking=true" +# } +# } diff --git a/OWNERS b/OWNERS new file mode 100644 index 0000000..741ab64 --- /dev/null +++ b/OWNERS @@ -0,0 +1,2 @@ +keerthanapg +yaolug diff --git a/README.md b/README.md new file mode 100644 index 0000000..08e8e5b --- /dev/null +++ b/README.md @@ -0,0 +1,45 @@ +# Robotics Transformer + +*This is not an officially supported Google product.* + + +This repository is a collection code files and artifacts for running +Robotics Transformer or RT-1. + +## Features + +* Film efficient net based image tokenizer backbone +* Token learner based compression of input tokens +* Transformer for end to end robotic control +* Testing utilities + +## Getting Started + +### Installation +Clone the repo +```bash +git clone https://github.com/google-research/robotics_transformer.git +pip install -r robotics_transformer/requirements.txt +python -m robotics_transformer.tokenizers.action_tokenizer.test +``` + +### Running Tests + +To run RT-1 tests, you can clone the git repo and run +[bazel](https://bazel.build/): + +```bash +git clone https://github.com/google_research/robotics_transformer.git +cd robotics_transformer +bazel test ... +``` + +## Future Releases + +The current repository includes an initial set of libraries for early adoption. +More components may come in future releases. + +## License + +The Robotics Transformer library is licensed under the terms of the Apache +license. diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..da89942 --- /dev/null +++ b/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""robotics_transformer API.""" + +# A new PyPI release will be pushed everytime `__version__` is increased +# When changing this, also update the CHANGELOG.md +__version__ = '0.1.0' diff --git a/configs/transformer_mixin.gin b/configs/transformer_mixin.gin new file mode 100644 index 0000000..0c9db1b --- /dev/null +++ b/configs/transformer_mixin.gin @@ -0,0 +1,27 @@ +from __gin__ import dynamic_registration +from robotics_transformer import transformer_network +from robotics_transformer.tokenizers import image_tokenizer +import tensorflow as tf + +LEARNING_RATE_ACTOR = 0.0001 +SEQUENCE_LENGTH = 6 + + +transformer_network.TransformerNetwork: + num_layers = 8 + layer_size = 128 + num_heads = 8 + feed_forward_size = 512 + dropout_rate = 0.1 + vocab_size = 256 + token_embedding_size = 512 + time_sequence_length = %SEQUENCE_LENGTH + crop_size = %CROP_SIZE + action_order = %ACTION_ORDER + use_token_learner = True + +actor_optimizer/tf.keras.optimizers.Adam: + learning_rate = %LEARNING_RATE_ACTOR + +ACTOR_NETWORK = @transformer_network.TransformerNetwork +ACTOR_OPTIMIZER = @actor_optimizer/tf.keras.optimizers.Adam() diff --git a/film_efficientnet/__init__.py b/film_efficientnet/__init__.py new file mode 100644 index 0000000..6d6d126 --- /dev/null +++ b/film_efficientnet/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
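Note on the gin config above: `configs/transformer_mixin.gin` is a mixin that fixes the RT-1 `TransformerNetwork` hyperparameters but only references the `%CROP_SIZE` and `%ACTION_ORDER` macros, so whatever configuration includes it must bind those values. The sketch below is a rough illustration of how such a mixin might be parsed with gin-config; it is not part of this commit, the binding values are placeholders rather than the hyperparameters used for RT-1, and it assumes the `robotics_transformer` package and its TensorFlow/gin dependencies are installed.

```python
# Illustrative sketch only (not part of this commit): parse the mixin together
# with bindings for the macros it references but does not define. The values
# below are placeholders for demonstration.
import gin

gin.parse_config_files_and_bindings(
    config_files=["robotics_transformer/configs/transformer_mixin.gin"],
    bindings=[
        "CROP_SIZE = 236",      # placeholder crop size
        "ACTION_ORDER = None",  # placeholder; a real config binds an action ordering
    ],
)

# After parsing, the %ACTOR_NETWORK and %ACTOR_OPTIMIZER macros defined by the
# mixin can be referenced from the training configuration.
```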
diff --git a/film_efficientnet/efficientnet_checkpoints/efficientnetb3.h5 b/film_efficientnet/efficientnet_checkpoints/efficientnetb3.h5 new file mode 100644 index 0000000..cdf5cd4 Binary files /dev/null and b/film_efficientnet/efficientnet_checkpoints/efficientnetb3.h5 differ diff --git a/film_efficientnet/efficientnet_checkpoints/efficientnetb3_notop.h5 b/film_efficientnet/efficientnet_checkpoints/efficientnetb3_notop.h5 new file mode 100644 index 0000000..021cc82 Binary files /dev/null and b/film_efficientnet/efficientnet_checkpoints/efficientnetb3_notop.h5 differ diff --git a/film_efficientnet/efficientnet_checkpoints/imagenet_classes.json b/film_efficientnet/efficientnet_checkpoints/imagenet_classes.json new file mode 100644 index 0000000..2ebd296 --- /dev/null +++ b/film_efficientnet/efficientnet_checkpoints/imagenet_classes.json @@ -0,0 +1 @@ +{"0": ["n01440764", "tench"], "1": ["n01443537", "goldfish"], "2": ["n01484850", "great_white_shark"], "3": ["n01491361", "tiger_shark"], "4": ["n01494475", "hammerhead"], "5": ["n01496331", "electric_ray"], "6": ["n01498041", "stingray"], "7": ["n01514668", "cock"], "8": ["n01514859", "hen"], "9": ["n01518878", "ostrich"], "10": ["n01530575", "brambling"], "11": ["n01531178", "goldfinch"], "12": ["n01532829", "house_finch"], "13": ["n01534433", "junco"], "14": ["n01537544", "indigo_bunting"], "15": ["n01558993", "robin"], "16": ["n01560419", "bulbul"], "17": ["n01580077", "jay"], "18": ["n01582220", "magpie"], "19": ["n01592084", "chickadee"], "20": ["n01601694", "water_ouzel"], "21": ["n01608432", "kite"], "22": ["n01614925", "bald_eagle"], "23": ["n01616318", "vulture"], "24": ["n01622779", "great_grey_owl"], "25": ["n01629819", "European_fire_salamander"], "26": ["n01630670", "common_newt"], "27": ["n01631663", "eft"], "28": ["n01632458", "spotted_salamander"], "29": ["n01632777", "axolotl"], "30": ["n01641577", "bullfrog"], "31": ["n01644373", "tree_frog"], "32": ["n01644900", "tailed_frog"], "33": ["n01664065", "loggerhead"], "34": ["n01665541", "leatherback_turtle"], "35": ["n01667114", "mud_turtle"], "36": ["n01667778", "terrapin"], "37": ["n01669191", "box_turtle"], "38": ["n01675722", "banded_gecko"], "39": ["n01677366", "common_iguana"], "40": ["n01682714", "American_chameleon"], "41": ["n01685808", "whiptail"], "42": ["n01687978", "agama"], "43": ["n01688243", "frilled_lizard"], "44": ["n01689811", "alligator_lizard"], "45": ["n01692333", "Gila_monster"], "46": ["n01693334", "green_lizard"], "47": ["n01694178", "African_chameleon"], "48": ["n01695060", "Komodo_dragon"], "49": ["n01697457", "African_crocodile"], "50": ["n01698640", "American_alligator"], "51": ["n01704323", "triceratops"], "52": ["n01728572", "thunder_snake"], "53": ["n01728920", "ringneck_snake"], "54": ["n01729322", "hognose_snake"], "55": ["n01729977", "green_snake"], "56": ["n01734418", "king_snake"], "57": ["n01735189", "garter_snake"], "58": ["n01737021", "water_snake"], "59": ["n01739381", "vine_snake"], "60": ["n01740131", "night_snake"], "61": ["n01742172", "boa_constrictor"], "62": ["n01744401", "rock_python"], "63": ["n01748264", "Indian_cobra"], "64": ["n01749939", "green_mamba"], "65": ["n01751748", "sea_snake"], "66": ["n01753488", "horned_viper"], "67": ["n01755581", "diamondback"], "68": ["n01756291", "sidewinder"], "69": ["n01768244", "trilobite"], "70": ["n01770081", "harvestman"], "71": ["n01770393", "scorpion"], "72": ["n01773157", "black_and_gold_garden_spider"], "73": ["n01773549", "barn_spider"], "74": ["n01773797", "garden_spider"], "75": 
["n01774384", "black_widow"], "76": ["n01774750", "tarantula"], "77": ["n01775062", "wolf_spider"], "78": ["n01776313", "tick"], "79": ["n01784675", "centipede"], "80": ["n01795545", "black_grouse"], "81": ["n01796340", "ptarmigan"], "82": ["n01797886", "ruffed_grouse"], "83": ["n01798484", "prairie_chicken"], "84": ["n01806143", "peacock"], "85": ["n01806567", "quail"], "86": ["n01807496", "partridge"], "87": ["n01817953", "African_grey"], "88": ["n01818515", "macaw"], "89": ["n01819313", "sulphur-crested_cockatoo"], "90": ["n01820546", "lorikeet"], "91": ["n01824575", "coucal"], "92": ["n01828970", "bee_eater"], "93": ["n01829413", "hornbill"], "94": ["n01833805", "hummingbird"], "95": ["n01843065", "jacamar"], "96": ["n01843383", "toucan"], "97": ["n01847000", "drake"], "98": ["n01855032", "red-breasted_merganser"], "99": ["n01855672", "goose"], "100": ["n01860187", "black_swan"], "101": ["n01871265", "tusker"], "102": ["n01872401", "echidna"], "103": ["n01873310", "platypus"], "104": ["n01877812", "wallaby"], "105": ["n01882714", "koala"], "106": ["n01883070", "wombat"], "107": ["n01910747", "jellyfish"], "108": ["n01914609", "sea_anemone"], "109": ["n01917289", "brain_coral"], "110": ["n01924916", "flatworm"], "111": ["n01930112", "nematode"], "112": ["n01943899", "conch"], "113": ["n01944390", "snail"], "114": ["n01945685", "slug"], "115": ["n01950731", "sea_slug"], "116": ["n01955084", "chiton"], "117": ["n01968897", "chambered_nautilus"], "118": ["n01978287", "Dungeness_crab"], "119": ["n01978455", "rock_crab"], "120": ["n01980166", "fiddler_crab"], "121": ["n01981276", "king_crab"], "122": ["n01983481", "American_lobster"], "123": ["n01984695", "spiny_lobster"], "124": ["n01985128", "crayfish"], "125": ["n01986214", "hermit_crab"], "126": ["n01990800", "isopod"], "127": ["n02002556", "white_stork"], "128": ["n02002724", "black_stork"], "129": ["n02006656", "spoonbill"], "130": ["n02007558", "flamingo"], "131": ["n02009229", "little_blue_heron"], "132": ["n02009912", "American_egret"], "133": ["n02011460", "bittern"], "134": ["n02012849", "crane"], "135": ["n02013706", "limpkin"], "136": ["n02017213", "European_gallinule"], "137": ["n02018207", "American_coot"], "138": ["n02018795", "bustard"], "139": ["n02025239", "ruddy_turnstone"], "140": ["n02027492", "red-backed_sandpiper"], "141": ["n02028035", "redshank"], "142": ["n02033041", "dowitcher"], "143": ["n02037110", "oystercatcher"], "144": ["n02051845", "pelican"], "145": ["n02056570", "king_penguin"], "146": ["n02058221", "albatross"], "147": ["n02066245", "grey_whale"], "148": ["n02071294", "killer_whale"], "149": ["n02074367", "dugong"], "150": ["n02077923", "sea_lion"], "151": ["n02085620", "Chihuahua"], "152": ["n02085782", "Japanese_spaniel"], "153": ["n02085936", "Maltese_dog"], "154": ["n02086079", "Pekinese"], "155": ["n02086240", "Shih-Tzu"], "156": ["n02086646", "Blenheim_spaniel"], "157": ["n02086910", "papillon"], "158": ["n02087046", "toy_terrier"], "159": ["n02087394", "Rhodesian_ridgeback"], "160": ["n02088094", "Afghan_hound"], "161": ["n02088238", "basset"], "162": ["n02088364", "beagle"], "163": ["n02088466", "bloodhound"], "164": ["n02088632", "bluetick"], "165": ["n02089078", "black-and-tan_coonhound"], "166": ["n02089867", "Walker_hound"], "167": ["n02089973", "English_foxhound"], "168": ["n02090379", "redbone"], "169": ["n02090622", "borzoi"], "170": ["n02090721", "Irish_wolfhound"], "171": ["n02091032", "Italian_greyhound"], "172": ["n02091134", "whippet"], "173": ["n02091244", "Ibizan_hound"], "174": 
["n02091467", "Norwegian_elkhound"], "175": ["n02091635", "otterhound"], "176": ["n02091831", "Saluki"], "177": ["n02092002", "Scottish_deerhound"], "178": ["n02092339", "Weimaraner"], "179": ["n02093256", "Staffordshire_bullterrier"], "180": ["n02093428", "American_Staffordshire_terrier"], "181": ["n02093647", "Bedlington_terrier"], "182": ["n02093754", "Border_terrier"], "183": ["n02093859", "Kerry_blue_terrier"], "184": ["n02093991", "Irish_terrier"], "185": ["n02094114", "Norfolk_terrier"], "186": ["n02094258", "Norwich_terrier"], "187": ["n02094433", "Yorkshire_terrier"], "188": ["n02095314", "wire-haired_fox_terrier"], "189": ["n02095570", "Lakeland_terrier"], "190": ["n02095889", "Sealyham_terrier"], "191": ["n02096051", "Airedale"], "192": ["n02096177", "cairn"], "193": ["n02096294", "Australian_terrier"], "194": ["n02096437", "Dandie_Dinmont"], "195": ["n02096585", "Boston_bull"], "196": ["n02097047", "miniature_schnauzer"], "197": ["n02097130", "giant_schnauzer"], "198": ["n02097209", "standard_schnauzer"], "199": ["n02097298", "Scotch_terrier"], "200": ["n02097474", "Tibetan_terrier"], "201": ["n02097658", "silky_terrier"], "202": ["n02098105", "soft-coated_wheaten_terrier"], "203": ["n02098286", "West_Highland_white_terrier"], "204": ["n02098413", "Lhasa"], "205": ["n02099267", "flat-coated_retriever"], "206": ["n02099429", "curly-coated_retriever"], "207": ["n02099601", "golden_retriever"], "208": ["n02099712", "Labrador_retriever"], "209": ["n02099849", "Chesapeake_Bay_retriever"], "210": ["n02100236", "German_short-haired_pointer"], "211": ["n02100583", "vizsla"], "212": ["n02100735", "English_setter"], "213": ["n02100877", "Irish_setter"], "214": ["n02101006", "Gordon_setter"], "215": ["n02101388", "Brittany_spaniel"], "216": ["n02101556", "clumber"], "217": ["n02102040", "English_springer"], "218": ["n02102177", "Welsh_springer_spaniel"], "219": ["n02102318", "cocker_spaniel"], "220": ["n02102480", "Sussex_spaniel"], "221": ["n02102973", "Irish_water_spaniel"], "222": ["n02104029", "kuvasz"], "223": ["n02104365", "schipperke"], "224": ["n02105056", "groenendael"], "225": ["n02105162", "malinois"], "226": ["n02105251", "briard"], "227": ["n02105412", "kelpie"], "228": ["n02105505", "komondor"], "229": ["n02105641", "Old_English_sheepdog"], "230": ["n02105855", "Shetland_sheepdog"], "231": ["n02106030", "collie"], "232": ["n02106166", "Border_collie"], "233": ["n02106382", "Bouvier_des_Flandres"], "234": ["n02106550", "Rottweiler"], "235": ["n02106662", "German_shepherd"], "236": ["n02107142", "Doberman"], "237": ["n02107312", "miniature_pinscher"], "238": ["n02107574", "Greater_Swiss_Mountain_dog"], "239": ["n02107683", "Bernese_mountain_dog"], "240": ["n02107908", "Appenzeller"], "241": ["n02108000", "EntleBucher"], "242": ["n02108089", "boxer"], "243": ["n02108422", "bull_mastiff"], "244": ["n02108551", "Tibetan_mastiff"], "245": ["n02108915", "French_bulldog"], "246": ["n02109047", "Great_Dane"], "247": ["n02109525", "Saint_Bernard"], "248": ["n02109961", "Eskimo_dog"], "249": ["n02110063", "malamute"], "250": ["n02110185", "Siberian_husky"], "251": ["n02110341", "dalmatian"], "252": ["n02110627", "affenpinscher"], "253": ["n02110806", "basenji"], "254": ["n02110958", "pug"], "255": ["n02111129", "Leonberg"], "256": ["n02111277", "Newfoundland"], "257": ["n02111500", "Great_Pyrenees"], "258": ["n02111889", "Samoyed"], "259": ["n02112018", "Pomeranian"], "260": ["n02112137", "chow"], "261": ["n02112350", "keeshond"], "262": ["n02112706", "Brabancon_griffon"], "263": 
["n02113023", "Pembroke"], "264": ["n02113186", "Cardigan"], "265": ["n02113624", "toy_poodle"], "266": ["n02113712", "miniature_poodle"], "267": ["n02113799", "standard_poodle"], "268": ["n02113978", "Mexican_hairless"], "269": ["n02114367", "timber_wolf"], "270": ["n02114548", "white_wolf"], "271": ["n02114712", "red_wolf"], "272": ["n02114855", "coyote"], "273": ["n02115641", "dingo"], "274": ["n02115913", "dhole"], "275": ["n02116738", "African_hunting_dog"], "276": ["n02117135", "hyena"], "277": ["n02119022", "red_fox"], "278": ["n02119789", "kit_fox"], "279": ["n02120079", "Arctic_fox"], "280": ["n02120505", "grey_fox"], "281": ["n02123045", "tabby"], "282": ["n02123159", "tiger_cat"], "283": ["n02123394", "Persian_cat"], "284": ["n02123597", "Siamese_cat"], "285": ["n02124075", "Egyptian_cat"], "286": ["n02125311", "cougar"], "287": ["n02127052", "lynx"], "288": ["n02128385", "leopard"], "289": ["n02128757", "snow_leopard"], "290": ["n02128925", "jaguar"], "291": ["n02129165", "lion"], "292": ["n02129604", "tiger"], "293": ["n02130308", "cheetah"], "294": ["n02132136", "brown_bear"], "295": ["n02133161", "American_black_bear"], "296": ["n02134084", "ice_bear"], "297": ["n02134418", "sloth_bear"], "298": ["n02137549", "mongoose"], "299": ["n02138441", "meerkat"], "300": ["n02165105", "tiger_beetle"], "301": ["n02165456", "ladybug"], "302": ["n02167151", "ground_beetle"], "303": ["n02168699", "long-horned_beetle"], "304": ["n02169497", "leaf_beetle"], "305": ["n02172182", "dung_beetle"], "306": ["n02174001", "rhinoceros_beetle"], "307": ["n02177972", "weevil"], "308": ["n02190166", "fly"], "309": ["n02206856", "bee"], "310": ["n02219486", "ant"], "311": ["n02226429", "grasshopper"], "312": ["n02229544", "cricket"], "313": ["n02231487", "walking_stick"], "314": ["n02233338", "cockroach"], "315": ["n02236044", "mantis"], "316": ["n02256656", "cicada"], "317": ["n02259212", "leafhopper"], "318": ["n02264363", "lacewing"], "319": ["n02268443", "dragonfly"], "320": ["n02268853", "damselfly"], "321": ["n02276258", "admiral"], "322": ["n02277742", "ringlet"], "323": ["n02279972", "monarch"], "324": ["n02280649", "cabbage_butterfly"], "325": ["n02281406", "sulphur_butterfly"], "326": ["n02281787", "lycaenid"], "327": ["n02317335", "starfish"], "328": ["n02319095", "sea_urchin"], "329": ["n02321529", "sea_cucumber"], "330": ["n02325366", "wood_rabbit"], "331": ["n02326432", "hare"], "332": ["n02328150", "Angora"], "333": ["n02342885", "hamster"], "334": ["n02346627", "porcupine"], "335": ["n02356798", "fox_squirrel"], "336": ["n02361337", "marmot"], "337": ["n02363005", "beaver"], "338": ["n02364673", "guinea_pig"], "339": ["n02389026", "sorrel"], "340": ["n02391049", "zebra"], "341": ["n02395406", "hog"], "342": ["n02396427", "wild_boar"], "343": ["n02397096", "warthog"], "344": ["n02398521", "hippopotamus"], "345": ["n02403003", "ox"], "346": ["n02408429", "water_buffalo"], "347": ["n02410509", "bison"], "348": ["n02412080", "ram"], "349": ["n02415577", "bighorn"], "350": ["n02417914", "ibex"], "351": ["n02422106", "hartebeest"], "352": ["n02422699", "impala"], "353": ["n02423022", "gazelle"], "354": ["n02437312", "Arabian_camel"], "355": ["n02437616", "llama"], "356": ["n02441942", "weasel"], "357": ["n02442845", "mink"], "358": ["n02443114", "polecat"], "359": ["n02443484", "black-footed_ferret"], "360": ["n02444819", "otter"], "361": ["n02445715", "skunk"], "362": ["n02447366", "badger"], "363": ["n02454379", "armadillo"], "364": ["n02457408", "three-toed_sloth"], "365": ["n02480495", 
"orangutan"], "366": ["n02480855", "gorilla"], "367": ["n02481823", "chimpanzee"], "368": ["n02483362", "gibbon"], "369": ["n02483708", "siamang"], "370": ["n02484975", "guenon"], "371": ["n02486261", "patas"], "372": ["n02486410", "baboon"], "373": ["n02487347", "macaque"], "374": ["n02488291", "langur"], "375": ["n02488702", "colobus"], "376": ["n02489166", "proboscis_monkey"], "377": ["n02490219", "marmoset"], "378": ["n02492035", "capuchin"], "379": ["n02492660", "howler_monkey"], "380": ["n02493509", "titi"], "381": ["n02493793", "spider_monkey"], "382": ["n02494079", "squirrel_monkey"], "383": ["n02497673", "Madagascar_cat"], "384": ["n02500267", "indri"], "385": ["n02504013", "Indian_elephant"], "386": ["n02504458", "African_elephant"], "387": ["n02509815", "lesser_panda"], "388": ["n02510455", "giant_panda"], "389": ["n02514041", "barracouta"], "390": ["n02526121", "eel"], "391": ["n02536864", "coho"], "392": ["n02606052", "rock_beauty"], "393": ["n02607072", "anemone_fish"], "394": ["n02640242", "sturgeon"], "395": ["n02641379", "gar"], "396": ["n02643566", "lionfish"], "397": ["n02655020", "puffer"], "398": ["n02666196", "abacus"], "399": ["n02667093", "abaya"], "400": ["n02669723", "academic_gown"], "401": ["n02672831", "accordion"], "402": ["n02676566", "acoustic_guitar"], "403": ["n02687172", "aircraft_carrier"], "404": ["n02690373", "airliner"], "405": ["n02692877", "airship"], "406": ["n02699494", "altar"], "407": ["n02701002", "ambulance"], "408": ["n02704792", "amphibian"], "409": ["n02708093", "analog_clock"], "410": ["n02727426", "apiary"], "411": ["n02730930", "apron"], "412": ["n02747177", "ashcan"], "413": ["n02749479", "assault_rifle"], "414": ["n02769748", "backpack"], "415": ["n02776631", "bakery"], "416": ["n02777292", "balance_beam"], "417": ["n02782093", "balloon"], "418": ["n02783161", "ballpoint"], "419": ["n02786058", "Band_Aid"], "420": ["n02787622", "banjo"], "421": ["n02788148", "bannister"], "422": ["n02790996", "barbell"], "423": ["n02791124", "barber_chair"], "424": ["n02791270", "barbershop"], "425": ["n02793495", "barn"], "426": ["n02794156", "barometer"], "427": ["n02795169", "barrel"], "428": ["n02797295", "barrow"], "429": ["n02799071", "baseball"], "430": ["n02802426", "basketball"], "431": ["n02804414", "bassinet"], "432": ["n02804610", "bassoon"], "433": ["n02807133", "bathing_cap"], "434": ["n02808304", "bath_towel"], "435": ["n02808440", "bathtub"], "436": ["n02814533", "beach_wagon"], "437": ["n02814860", "beacon"], "438": ["n02815834", "beaker"], "439": ["n02817516", "bearskin"], "440": ["n02823428", "beer_bottle"], "441": ["n02823750", "beer_glass"], "442": ["n02825657", "bell_cote"], "443": ["n02834397", "bib"], "444": ["n02835271", "bicycle-built-for-two"], "445": ["n02837789", "bikini"], "446": ["n02840245", "binder"], "447": ["n02841315", "binoculars"], "448": ["n02843684", "birdhouse"], "449": ["n02859443", "boathouse"], "450": ["n02860847", "bobsled"], "451": ["n02865351", "bolo_tie"], "452": ["n02869837", "bonnet"], "453": ["n02870880", "bookcase"], "454": ["n02871525", "bookshop"], "455": ["n02877765", "bottlecap"], "456": ["n02879718", "bow"], "457": ["n02883205", "bow_tie"], "458": ["n02892201", "brass"], "459": ["n02892767", "brassiere"], "460": ["n02894605", "breakwater"], "461": ["n02895154", "breastplate"], "462": ["n02906734", "broom"], "463": ["n02909870", "bucket"], "464": ["n02910353", "buckle"], "465": ["n02916936", "bulletproof_vest"], "466": ["n02917067", "bullet_train"], "467": ["n02927161", "butcher_shop"], "468": 
["n02930766", "cab"], "469": ["n02939185", "caldron"], "470": ["n02948072", "candle"], "471": ["n02950826", "cannon"], "472": ["n02951358", "canoe"], "473": ["n02951585", "can_opener"], "474": ["n02963159", "cardigan"], "475": ["n02965783", "car_mirror"], "476": ["n02966193", "carousel"], "477": ["n02966687", "carpenter's_kit"], "478": ["n02971356", "carton"], "479": ["n02974003", "car_wheel"], "480": ["n02977058", "cash_machine"], "481": ["n02978881", "cassette"], "482": ["n02979186", "cassette_player"], "483": ["n02980441", "castle"], "484": ["n02981792", "catamaran"], "485": ["n02988304", "CD_player"], "486": ["n02992211", "cello"], "487": ["n02992529", "cellular_telephone"], "488": ["n02999410", "chain"], "489": ["n03000134", "chainlink_fence"], "490": ["n03000247", "chain_mail"], "491": ["n03000684", "chain_saw"], "492": ["n03014705", "chest"], "493": ["n03016953", "chiffonier"], "494": ["n03017168", "chime"], "495": ["n03018349", "china_cabinet"], "496": ["n03026506", "Christmas_stocking"], "497": ["n03028079", "church"], "498": ["n03032252", "cinema"], "499": ["n03041632", "cleaver"], "500": ["n03042490", "cliff_dwelling"], "501": ["n03045698", "cloak"], "502": ["n03047690", "clog"], "503": ["n03062245", "cocktail_shaker"], "504": ["n03063599", "coffee_mug"], "505": ["n03063689", "coffeepot"], "506": ["n03065424", "coil"], "507": ["n03075370", "combination_lock"], "508": ["n03085013", "computer_keyboard"], "509": ["n03089624", "confectionery"], "510": ["n03095699", "container_ship"], "511": ["n03100240", "convertible"], "512": ["n03109150", "corkscrew"], "513": ["n03110669", "cornet"], "514": ["n03124043", "cowboy_boot"], "515": ["n03124170", "cowboy_hat"], "516": ["n03125729", "cradle"], "517": ["n03126707", "crane"], "518": ["n03127747", "crash_helmet"], "519": ["n03127925", "crate"], "520": ["n03131574", "crib"], "521": ["n03133878", "Crock_Pot"], "522": ["n03134739", "croquet_ball"], "523": ["n03141823", "crutch"], "524": ["n03146219", "cuirass"], "525": ["n03160309", "dam"], "526": ["n03179701", "desk"], "527": ["n03180011", "desktop_computer"], "528": ["n03187595", "dial_telephone"], "529": ["n03188531", "diaper"], "530": ["n03196217", "digital_clock"], "531": ["n03197337", "digital_watch"], "532": ["n03201208", "dining_table"], "533": ["n03207743", "dishrag"], "534": ["n03207941", "dishwasher"], "535": ["n03208938", "disk_brake"], "536": ["n03216828", "dock"], "537": ["n03218198", "dogsled"], "538": ["n03220513", "dome"], "539": ["n03223299", "doormat"], "540": ["n03240683", "drilling_platform"], "541": ["n03249569", "drum"], "542": ["n03250847", "drumstick"], "543": ["n03255030", "dumbbell"], "544": ["n03259280", "Dutch_oven"], "545": ["n03271574", "electric_fan"], "546": ["n03272010", "electric_guitar"], "547": ["n03272562", "electric_locomotive"], "548": ["n03290653", "entertainment_center"], "549": ["n03291819", "envelope"], "550": ["n03297495", "espresso_maker"], "551": ["n03314780", "face_powder"], "552": ["n03325584", "feather_boa"], "553": ["n03337140", "file"], "554": ["n03344393", "fireboat"], "555": ["n03345487", "fire_engine"], "556": ["n03347037", "fire_screen"], "557": ["n03355925", "flagpole"], "558": ["n03372029", "flute"], "559": ["n03376595", "folding_chair"], "560": ["n03379051", "football_helmet"], "561": ["n03384352", "forklift"], "562": ["n03388043", "fountain"], "563": ["n03388183", "fountain_pen"], "564": ["n03388549", "four-poster"], "565": ["n03393912", "freight_car"], "566": ["n03394916", "French_horn"], "567": ["n03400231", "frying_pan"], "568": 
["n03404251", "fur_coat"], "569": ["n03417042", "garbage_truck"], "570": ["n03424325", "gasmask"], "571": ["n03425413", "gas_pump"], "572": ["n03443371", "goblet"], "573": ["n03444034", "go-kart"], "574": ["n03445777", "golf_ball"], "575": ["n03445924", "golfcart"], "576": ["n03447447", "gondola"], "577": ["n03447721", "gong"], "578": ["n03450230", "gown"], "579": ["n03452741", "grand_piano"], "580": ["n03457902", "greenhouse"], "581": ["n03459775", "grille"], "582": ["n03461385", "grocery_store"], "583": ["n03467068", "guillotine"], "584": ["n03476684", "hair_slide"], "585": ["n03476991", "hair_spray"], "586": ["n03478589", "half_track"], "587": ["n03481172", "hammer"], "588": ["n03482405", "hamper"], "589": ["n03483316", "hand_blower"], "590": ["n03485407", "hand-held_computer"], "591": ["n03485794", "handkerchief"], "592": ["n03492542", "hard_disc"], "593": ["n03494278", "harmonica"], "594": ["n03495258", "harp"], "595": ["n03496892", "harvester"], "596": ["n03498962", "hatchet"], "597": ["n03527444", "holster"], "598": ["n03529860", "home_theater"], "599": ["n03530642", "honeycomb"], "600": ["n03532672", "hook"], "601": ["n03534580", "hoopskirt"], "602": ["n03535780", "horizontal_bar"], "603": ["n03538406", "horse_cart"], "604": ["n03544143", "hourglass"], "605": ["n03584254", "iPod"], "606": ["n03584829", "iron"], "607": ["n03590841", "jack-o'-lantern"], "608": ["n03594734", "jean"], "609": ["n03594945", "jeep"], "610": ["n03595614", "jersey"], "611": ["n03598930", "jigsaw_puzzle"], "612": ["n03599486", "jinrikisha"], "613": ["n03602883", "joystick"], "614": ["n03617480", "kimono"], "615": ["n03623198", "knee_pad"], "616": ["n03627232", "knot"], "617": ["n03630383", "lab_coat"], "618": ["n03633091", "ladle"], "619": ["n03637318", "lampshade"], "620": ["n03642806", "laptop"], "621": ["n03649909", "lawn_mower"], "622": ["n03657121", "lens_cap"], "623": ["n03658185", "letter_opener"], "624": ["n03661043", "library"], "625": ["n03662601", "lifeboat"], "626": ["n03666591", "lighter"], "627": ["n03670208", "limousine"], "628": ["n03673027", "liner"], "629": ["n03676483", "lipstick"], "630": ["n03680355", "Loafer"], "631": ["n03690938", "lotion"], "632": ["n03691459", "loudspeaker"], "633": ["n03692522", "loupe"], "634": ["n03697007", "lumbermill"], "635": ["n03706229", "magnetic_compass"], "636": ["n03709823", "mailbag"], "637": ["n03710193", "mailbox"], "638": ["n03710637", "maillot"], "639": ["n03710721", "maillot"], "640": ["n03717622", "manhole_cover"], "641": ["n03720891", "maraca"], "642": ["n03721384", "marimba"], "643": ["n03724870", "mask"], "644": ["n03729826", "matchstick"], "645": ["n03733131", "maypole"], "646": ["n03733281", "maze"], "647": ["n03733805", "measuring_cup"], "648": ["n03742115", "medicine_chest"], "649": ["n03743016", "megalith"], "650": ["n03759954", "microphone"], "651": ["n03761084", "microwave"], "652": ["n03763968", "military_uniform"], "653": ["n03764736", "milk_can"], "654": ["n03769881", "minibus"], "655": ["n03770439", "miniskirt"], "656": ["n03770679", "minivan"], "657": ["n03773504", "missile"], "658": ["n03775071", "mitten"], "659": ["n03775546", "mixing_bowl"], "660": ["n03776460", "mobile_home"], "661": ["n03777568", "Model_T"], "662": ["n03777754", "modem"], "663": ["n03781244", "monastery"], "664": ["n03782006", "monitor"], "665": ["n03785016", "moped"], "666": ["n03786901", "mortar"], "667": ["n03787032", "mortarboard"], "668": ["n03788195", "mosque"], "669": ["n03788365", "mosquito_net"], "670": ["n03791053", "motor_scooter"], "671": 
["n03792782", "mountain_bike"], "672": ["n03792972", "mountain_tent"], "673": ["n03793489", "mouse"], "674": ["n03794056", "mousetrap"], "675": ["n03796401", "moving_van"], "676": ["n03803284", "muzzle"], "677": ["n03804744", "nail"], "678": ["n03814639", "neck_brace"], "679": ["n03814906", "necklace"], "680": ["n03825788", "nipple"], "681": ["n03832673", "notebook"], "682": ["n03837869", "obelisk"], "683": ["n03838899", "oboe"], "684": ["n03840681", "ocarina"], "685": ["n03841143", "odometer"], "686": ["n03843555", "oil_filter"], "687": ["n03854065", "organ"], "688": ["n03857828", "oscilloscope"], "689": ["n03866082", "overskirt"], "690": ["n03868242", "oxcart"], "691": ["n03868863", "oxygen_mask"], "692": ["n03871628", "packet"], "693": ["n03873416", "paddle"], "694": ["n03874293", "paddlewheel"], "695": ["n03874599", "padlock"], "696": ["n03876231", "paintbrush"], "697": ["n03877472", "pajama"], "698": ["n03877845", "palace"], "699": ["n03884397", "panpipe"], "700": ["n03887697", "paper_towel"], "701": ["n03888257", "parachute"], "702": ["n03888605", "parallel_bars"], "703": ["n03891251", "park_bench"], "704": ["n03891332", "parking_meter"], "705": ["n03895866", "passenger_car"], "706": ["n03899768", "patio"], "707": ["n03902125", "pay-phone"], "708": ["n03903868", "pedestal"], "709": ["n03908618", "pencil_box"], "710": ["n03908714", "pencil_sharpener"], "711": ["n03916031", "perfume"], "712": ["n03920288", "Petri_dish"], "713": ["n03924679", "photocopier"], "714": ["n03929660", "pick"], "715": ["n03929855", "pickelhaube"], "716": ["n03930313", "picket_fence"], "717": ["n03930630", "pickup"], "718": ["n03933933", "pier"], "719": ["n03935335", "piggy_bank"], "720": ["n03937543", "pill_bottle"], "721": ["n03938244", "pillow"], "722": ["n03942813", "ping-pong_ball"], "723": ["n03944341", "pinwheel"], "724": ["n03947888", "pirate"], "725": ["n03950228", "pitcher"], "726": ["n03954731", "plane"], "727": ["n03956157", "planetarium"], "728": ["n03958227", "plastic_bag"], "729": ["n03961711", "plate_rack"], "730": ["n03967562", "plow"], "731": ["n03970156", "plunger"], "732": ["n03976467", "Polaroid_camera"], "733": ["n03976657", "pole"], "734": ["n03977966", "police_van"], "735": ["n03980874", "poncho"], "736": ["n03982430", "pool_table"], "737": ["n03983396", "pop_bottle"], "738": ["n03991062", "pot"], "739": ["n03992509", "potter's_wheel"], "740": ["n03995372", "power_drill"], "741": ["n03998194", "prayer_rug"], "742": ["n04004767", "printer"], "743": ["n04005630", "prison"], "744": ["n04008634", "projectile"], "745": ["n04009552", "projector"], "746": ["n04019541", "puck"], "747": ["n04023962", "punching_bag"], "748": ["n04026417", "purse"], "749": ["n04033901", "quill"], "750": ["n04033995", "quilt"], "751": ["n04037443", "racer"], "752": ["n04039381", "racket"], "753": ["n04040759", "radiator"], "754": ["n04041544", "radio"], "755": ["n04044716", "radio_telescope"], "756": ["n04049303", "rain_barrel"], "757": ["n04065272", "recreational_vehicle"], "758": ["n04067472", "reel"], "759": ["n04069434", "reflex_camera"], "760": ["n04070727", "refrigerator"], "761": ["n04074963", "remote_control"], "762": ["n04081281", "restaurant"], "763": ["n04086273", "revolver"], "764": ["n04090263", "rifle"], "765": ["n04099969", "rocking_chair"], "766": ["n04111531", "rotisserie"], "767": ["n04116512", "rubber_eraser"], "768": ["n04118538", "rugby_ball"], "769": ["n04118776", "rule"], "770": ["n04120489", "running_shoe"], "771": ["n04125021", "safe"], "772": ["n04127249", "safety_pin"], "773": 
["n04131690", "saltshaker"], "774": ["n04133789", "sandal"], "775": ["n04136333", "sarong"], "776": ["n04141076", "sax"], "777": ["n04141327", "scabbard"], "778": ["n04141975", "scale"], "779": ["n04146614", "school_bus"], "780": ["n04147183", "schooner"], "781": ["n04149813", "scoreboard"], "782": ["n04152593", "screen"], "783": ["n04153751", "screw"], "784": ["n04154565", "screwdriver"], "785": ["n04162706", "seat_belt"], "786": ["n04179913", "sewing_machine"], "787": ["n04192698", "shield"], "788": ["n04200800", "shoe_shop"], "789": ["n04201297", "shoji"], "790": ["n04204238", "shopping_basket"], "791": ["n04204347", "shopping_cart"], "792": ["n04208210", "shovel"], "793": ["n04209133", "shower_cap"], "794": ["n04209239", "shower_curtain"], "795": ["n04228054", "ski"], "796": ["n04229816", "ski_mask"], "797": ["n04235860", "sleeping_bag"], "798": ["n04238763", "slide_rule"], "799": ["n04239074", "sliding_door"], "800": ["n04243546", "slot"], "801": ["n04251144", "snorkel"], "802": ["n04252077", "snowmobile"], "803": ["n04252225", "snowplow"], "804": ["n04254120", "soap_dispenser"], "805": ["n04254680", "soccer_ball"], "806": ["n04254777", "sock"], "807": ["n04258138", "solar_dish"], "808": ["n04259630", "sombrero"], "809": ["n04263257", "soup_bowl"], "810": ["n04264628", "space_bar"], "811": ["n04265275", "space_heater"], "812": ["n04266014", "space_shuttle"], "813": ["n04270147", "spatula"], "814": ["n04273569", "speedboat"], "815": ["n04275548", "spider_web"], "816": ["n04277352", "spindle"], "817": ["n04285008", "sports_car"], "818": ["n04286575", "spotlight"], "819": ["n04296562", "stage"], "820": ["n04310018", "steam_locomotive"], "821": ["n04311004", "steel_arch_bridge"], "822": ["n04311174", "steel_drum"], "823": ["n04317175", "stethoscope"], "824": ["n04325704", "stole"], "825": ["n04326547", "stone_wall"], "826": ["n04328186", "stopwatch"], "827": ["n04330267", "stove"], "828": ["n04332243", "strainer"], "829": ["n04335435", "streetcar"], "830": ["n04336792", "stretcher"], "831": ["n04344873", "studio_couch"], "832": ["n04346328", "stupa"], "833": ["n04347754", "submarine"], "834": ["n04350905", "suit"], "835": ["n04355338", "sundial"], "836": ["n04355933", "sunglass"], "837": ["n04356056", "sunglasses"], "838": ["n04357314", "sunscreen"], "839": ["n04366367", "suspension_bridge"], "840": ["n04367480", "swab"], "841": ["n04370456", "sweatshirt"], "842": ["n04371430", "swimming_trunks"], "843": ["n04371774", "swing"], "844": ["n04372370", "switch"], "845": ["n04376876", "syringe"], "846": ["n04380533", "table_lamp"], "847": ["n04389033", "tank"], "848": ["n04392985", "tape_player"], "849": ["n04398044", "teapot"], "850": ["n04399382", "teddy"], "851": ["n04404412", "television"], "852": ["n04409515", "tennis_ball"], "853": ["n04417672", "thatch"], "854": ["n04418357", "theater_curtain"], "855": ["n04423845", "thimble"], "856": ["n04428191", "thresher"], "857": ["n04429376", "throne"], "858": ["n04435653", "tile_roof"], "859": ["n04442312", "toaster"], "860": ["n04443257", "tobacco_shop"], "861": ["n04447861", "toilet_seat"], "862": ["n04456115", "torch"], "863": ["n04458633", "totem_pole"], "864": ["n04461696", "tow_truck"], "865": ["n04462240", "toyshop"], "866": ["n04465501", "tractor"], "867": ["n04467665", "trailer_truck"], "868": ["n04476259", "tray"], "869": ["n04479046", "trench_coat"], "870": ["n04482393", "tricycle"], "871": ["n04483307", "trimaran"], "872": ["n04485082", "tripod"], "873": ["n04486054", "triumphal_arch"], "874": ["n04487081", "trolleybus"], "875": 
["n04487394", "trombone"], "876": ["n04493381", "tub"], "877": ["n04501370", "turnstile"], "878": ["n04505470", "typewriter_keyboard"], "879": ["n04507155", "umbrella"], "880": ["n04509417", "unicycle"], "881": ["n04515003", "upright"], "882": ["n04517823", "vacuum"], "883": ["n04522168", "vase"], "884": ["n04523525", "vault"], "885": ["n04525038", "velvet"], "886": ["n04525305", "vending_machine"], "887": ["n04532106", "vestment"], "888": ["n04532670", "viaduct"], "889": ["n04536866", "violin"], "890": ["n04540053", "volleyball"], "891": ["n04542943", "waffle_iron"], "892": ["n04548280", "wall_clock"], "893": ["n04548362", "wallet"], "894": ["n04550184", "wardrobe"], "895": ["n04552348", "warplane"], "896": ["n04553703", "washbasin"], "897": ["n04554684", "washer"], "898": ["n04557648", "water_bottle"], "899": ["n04560804", "water_jug"], "900": ["n04562935", "water_tower"], "901": ["n04579145", "whiskey_jug"], "902": ["n04579432", "whistle"], "903": ["n04584207", "wig"], "904": ["n04589890", "window_screen"], "905": ["n04590129", "window_shade"], "906": ["n04591157", "Windsor_tie"], "907": ["n04591713", "wine_bottle"], "908": ["n04592741", "wing"], "909": ["n04596742", "wok"], "910": ["n04597913", "wooden_spoon"], "911": ["n04599235", "wool"], "912": ["n04604644", "worm_fence"], "913": ["n04606251", "wreck"], "914": ["n04612504", "yawl"], "915": ["n04613696", "yurt"], "916": ["n06359193", "web_site"], "917": ["n06596364", "comic_book"], "918": ["n06785654", "crossword_puzzle"], "919": ["n06794110", "street_sign"], "920": ["n06874185", "traffic_light"], "921": ["n07248320", "book_jacket"], "922": ["n07565083", "menu"], "923": ["n07579787", "plate"], "924": ["n07583066", "guacamole"], "925": ["n07584110", "consomme"], "926": ["n07590611", "hot_pot"], "927": ["n07613480", "trifle"], "928": ["n07614500", "ice_cream"], "929": ["n07615774", "ice_lolly"], "930": ["n07684084", "French_loaf"], "931": ["n07693725", "bagel"], "932": ["n07695742", "pretzel"], "933": ["n07697313", "cheeseburger"], "934": ["n07697537", "hotdog"], "935": ["n07711569", "mashed_potato"], "936": ["n07714571", "head_cabbage"], "937": ["n07714990", "broccoli"], "938": ["n07715103", "cauliflower"], "939": ["n07716358", "zucchini"], "940": ["n07716906", "spaghetti_squash"], "941": ["n07717410", "acorn_squash"], "942": ["n07717556", "butternut_squash"], "943": ["n07718472", "cucumber"], "944": ["n07718747", "artichoke"], "945": ["n07720875", "bell_pepper"], "946": ["n07730033", "cardoon"], "947": ["n07734744", "mushroom"], "948": ["n07742313", "Granny_Smith"], "949": ["n07745940", "strawberry"], "950": ["n07747607", "orange"], "951": ["n07749582", "lemon"], "952": ["n07753113", "fig"], "953": ["n07753275", "pineapple"], "954": ["n07753592", "banana"], "955": ["n07754684", "jackfruit"], "956": ["n07760859", "custard_apple"], "957": ["n07768694", "pomegranate"], "958": ["n07802026", "hay"], "959": ["n07831146", "carbonara"], "960": ["n07836838", "chocolate_sauce"], "961": ["n07860988", "dough"], "962": ["n07871810", "meat_loaf"], "963": ["n07873807", "pizza"], "964": ["n07875152", "potpie"], "965": ["n07880968", "burrito"], "966": ["n07892512", "red_wine"], "967": ["n07920052", "espresso"], "968": ["n07930864", "cup"], "969": ["n07932039", "eggnog"], "970": ["n09193705", "alp"], "971": ["n09229709", "bubble"], "972": ["n09246464", "cliff"], "973": ["n09256479", "coral_reef"], "974": ["n09288635", "geyser"], "975": ["n09332890", "lakeside"], "976": ["n09399592", "promontory"], "977": ["n09421951", "sandbar"], "978": ["n09428293", 
"seashore"], "979": ["n09468604", "valley"], "980": ["n09472597", "volcano"], "981": ["n09835506", "ballplayer"], "982": ["n10148035", "groom"], "983": ["n10565667", "scuba_diver"], "984": ["n11879895", "rapeseed"], "985": ["n11939491", "daisy"], "986": ["n12057211", "yellow_lady's_slipper"], "987": ["n12144580", "corn"], "988": ["n12267677", "acorn"], "989": ["n12620546", "hip"], "990": ["n12768682", "buckeye"], "991": ["n12985857", "coral_fungus"], "992": ["n12998815", "agaric"], "993": ["n13037406", "gyromitra"], "994": ["n13040303", "stinkhorn"], "995": ["n13044778", "earthstar"], "996": ["n13052670", "hen-of-the-woods"], "997": ["n13054560", "bolete"], "998": ["n13133613", "ear"], "999": ["n15075141", "toilet_tissue"]} diff --git a/film_efficientnet/film_conditioning_layer.py b/film_efficientnet/film_conditioning_layer.py new file mode 100644 index 0000000..5cace77 --- /dev/null +++ b/film_efficientnet/film_conditioning_layer.py @@ -0,0 +1,74 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ResNet variants model for Keras with Film-Conditioning. + +Related papers/blogs: +- https://arxiv.org/abs/1512.03385 +- https://arxiv.org/pdf/1603.05027v2.pdf +- http://torch.ch/blog/2016/02/04/resnets.html +- https://arxiv.org/abs/1709.07871 +""" +import tensorflow.compat.v2 as tf + +layers = tf.keras.layers + + +class FilmConditioning(tf.keras.layers.Layer): + """Layer that adds FiLM conditioning. + + This is intended to be applied after a convolutional layer. It will learn a + multiplicative and an additive factor to be applied to each channel of the + convolution's output. + + Conv layer can be rank 2 or 4. + + For further details, see: https://arxiv.org/abs/1709.07871 + """ + + def __init__(self, num_channels: int): + """Constructs a FiLM conditioning layer. + + Args: + num_channels: Number of filter channels to expect in the input. + """ + super().__init__() + # Note that we initialize with zeros because empirically we have found + # this works better than initializing with glorot. + self._projection_add = layers.Dense( + num_channels, + activation=None, + kernel_initializer='zeros', + bias_initializer='zeros') + self._projection_mult = layers.Dense( + num_channels, + activation=None, + kernel_initializer='zeros', + bias_initializer='zeros') + + def call(self, conv_filters: tf.Tensor, conditioning: tf.Tensor): + tf.debugging.assert_rank(conditioning, 2) + projected_cond_add = self._projection_add(conditioning) + projected_cond_mult = self._projection_mult(conditioning) + + if len(conv_filters.shape) == 4: + # [B, D] -> [B, 1, 1, D] + projected_cond_add = projected_cond_add[:, tf.newaxis, tf.newaxis] + projected_cond_mult = projected_cond_mult[:, tf.newaxis, tf.newaxis] + else: + tf.debugging.assert_rank(conv_filters, 2) + + # Original FiLM paper argues that 1 + gamma centers the initialization at + # identity transform. 
+ result = (1 + projected_cond_mult) * conv_filters + projected_cond_add + return result diff --git a/film_efficientnet/film_conditioning_layer_test.py b/film_efficientnet/film_conditioning_layer_test.py new file mode 100644 index 0000000..486d6c8 --- /dev/null +++ b/film_efficientnet/film_conditioning_layer_test.py @@ -0,0 +1,40 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for film_conditioning_layer.""" +from absl.testing import parameterized +import numpy as np +from robotics_transformer.film_efficientnet import film_conditioning_layer +import tensorflow as tf + + +class FilmConditioningLayerTest(tf.test.TestCase, parameterized.TestCase): + + @parameterized.parameters([2, 4]) + def test_film_conditioning_rank_two_and_four(self, conv_rank): + batch = 2 + num_channels = 3 + if conv_rank == 2: + conv_layer = np.random.randn(batch, num_channels) + elif conv_rank == 4: + conv_layer = np.random.randn(batch, 1, 1, num_channels) + else: + raise ValueError(f'Unexpected conv rank: {conv_rank}') + context = np.random.rand(batch, num_channels) + film_layer = film_conditioning_layer.FilmConditioning(num_channels) + out = film_layer(conv_layer, context) + tf.debugging.assert_rank(out, conv_rank) + + +if __name__ == '__main__': + tf.test.main() diff --git a/film_efficientnet/film_efficientnet_encoder.py b/film_efficientnet/film_efficientnet_encoder.py new file mode 100644 index 0000000..078ff43 --- /dev/null +++ b/film_efficientnet/film_efficientnet_encoder.py @@ -0,0 +1,759 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pytype: skip-file +# pylint: skip-file +"""EfficientNet models modified with added film layers. 
+ +Mostly copied from third_party/py/keras/applications/efficientnet.py +""" + +import copy +import math +import os +import warnings +import json + +from absl import logging +import tensorflow.compat.v2 as tf +from tensorflow.keras import layers + +from robotics_transformer.film_efficientnet.film_conditioning_layer import FilmConditioning + +BASE_WEIGHTS_PATH = 'efficientnet_checkpoints/efficientnet' +IMAGENET_JSON_PATH = 'efficientnet_checkpoints/imagenet_classes.json' +CLASS_INDEX = None + +WEIGHTS_PATHS = { + 'efficientnetb3': BASE_WEIGHTS_PATH + 'b3.h5', + 'efficientnetb3_notop': BASE_WEIGHTS_PATH + 'b3_notop.h5', +} + +DEFAULT_BLOCKS_ARGS = [{ + 'kernel_size': 3, + 'repeats': 1, + 'filters_in': 32, + 'filters_out': 16, + 'expand_ratio': 1, + 'id_skip': True, + 'strides': 1, + 'se_ratio': 0.25 +}, { + 'kernel_size': 3, + 'repeats': 2, + 'filters_in': 16, + 'filters_out': 24, + 'expand_ratio': 6, + 'id_skip': True, + 'strides': 2, + 'se_ratio': 0.25 +}, { + 'kernel_size': 5, + 'repeats': 2, + 'filters_in': 24, + 'filters_out': 40, + 'expand_ratio': 6, + 'id_skip': True, + 'strides': 2, + 'se_ratio': 0.25 +}, { + 'kernel_size': 3, + 'repeats': 3, + 'filters_in': 40, + 'filters_out': 80, + 'expand_ratio': 6, + 'id_skip': True, + 'strides': 2, + 'se_ratio': 0.25 +}, { + 'kernel_size': 5, + 'repeats': 3, + 'filters_in': 80, + 'filters_out': 112, + 'expand_ratio': 6, + 'id_skip': True, + 'strides': 1, + 'se_ratio': 0.25 +}, { + 'kernel_size': 5, + 'repeats': 4, + 'filters_in': 112, + 'filters_out': 192, + 'expand_ratio': 6, + 'id_skip': True, + 'strides': 2, + 'se_ratio': 0.25 +}, { + 'kernel_size': 3, + 'repeats': 1, + 'filters_in': 192, + 'filters_out': 320, + 'expand_ratio': 6, + 'id_skip': True, + 'strides': 1, + 'se_ratio': 0.25 +}] + +CONV_KERNEL_INITIALIZER = { + 'class_name': 'VarianceScaling', + 'config': { + 'scale': 2.0, + 'mode': 'fan_out', + 'distribution': 'truncated_normal' + } +} + +DENSE_KERNEL_INITIALIZER = { + 'class_name': 'VarianceScaling', + 'config': { + 'scale': 1. / 3., + 'mode': 'fan_out', + 'distribution': 'uniform' + } +} + +BASE_DOCSTRING = """Instantiates the {name} architecture. + + Reference: + - [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks]( + https://arxiv.org/abs/1905.11946) (ICML 2019) + + This function returns a Keras image classification model, + optionally loaded with weights pre-trained on ImageNet. + + For image classification use cases, see + [this page for detailed examples]( + https://keras.io/api/applications/#usage-examples-for-image-classification-models). + + For transfer learning use cases, make sure to read the + [guide to transfer learning & fine-tuning]( + https://keras.io/guides/transfer_learning/). + + Note: each Keras Application expects a specific kind of input preprocessing. + For EfficientNet, input preprocessing is included as part of the model + (as a `Rescaling` layer), and thus + `tf.keras.applications.efficientnet.preprocess_input` is actually a + pass-through function. EfficientNet models expect their inputs to be float + tensors of pixels with values in the [0-255] range. + + Args: + include_top: Whether to include the fully-connected + layer at the top of the network. Defaults to True. + weights: One of `None` (random initialization), + 'imagenet' (pre-training on ImageNet), + or the path to the weights file to be loaded. Defaults to 'imagenet'. + input_tensor: Optional Keras tensor + (i.e. output of `layers.Input()`) + to use as image input for the model. 
+ input_shape: Optional shape tuple, only to be specified + if `include_top` is False. + It should have exactly 3 inputs channels. + pooling: Optional pooling mode for feature extraction + when `include_top` is `False`. Defaults to None. + - `None` means that the output of the model will be + the 4D tensor output of the + last convolutional layer. + - `avg` means that global average pooling + will be applied to the output of the + last convolutional layer, and thus + the output of the model will be a 2D tensor. + - `max` means that global max pooling will + be applied. + classes: Optional number of classes to classify images + into, only to be specified if `include_top` is True, and + if no `weights` argument is specified. Defaults to 1000 (number of + ImageNet classes). + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. + Defaults to 'softmax'. + When loading pretrained weights, `classifier_activation` can only + be `None` or `"softmax"`. + + Returns: + A `keras.Model` instance. +""" + +IMAGENET_STDDEV_RGB = [0.229, 0.224, 0.225] + + +def validate_activation(classifier_activation, weights): + """validates that the classifier is compatible with the weights. + + Args: + classifier_activation: str or callable activation function + weights: The pretrained weights to load. + + Raises: + ValueError: if an activation other than `None` or `softmax` are used with + pretrained weights. + """ + if weights is None: + return + + classifier_activation = tf.keras.activations.get(classifier_activation) + if classifier_activation not in { + tf.keras.activations.get('softmax'), + tf.keras.activations.get(None) + }: + raise ValueError('Only `None` and `softmax` activations are allowed ' + 'for the `classifier_activation` argument when using ' + 'pretrained weights, with `include_top=True`; Received: ' + f'classifier_activation={classifier_activation}') + + +def correct_pad(inputs, kernel_size): + """Returns a tuple for zero-padding for 2D convolution with downsampling. + + Args: + inputs: Input tensor. + kernel_size: An integer or tuple/list of 2 integers. + + Returns: + A tuple. + """ + img_dim = 2 if tf.keras.backend.image_data_format() == 'channels_first' else 1 + input_size = tf.keras.backend.int_shape(inputs)[img_dim:(img_dim + 2)] + if isinstance(kernel_size, int): + kernel_size = (kernel_size, kernel_size) + if input_size[0] is None: + adjust = (1, 1) + else: + adjust = (1 - input_size[0] % 2, 1 - input_size[1] % 2) + correct = (kernel_size[0] // 2, kernel_size[1] // 2) + return ((correct[0] - adjust[0], correct[0]), (correct[1] - adjust[1], + correct[1])) + + +def obtain_input_shape(input_shape, + default_size, + min_size, + data_format, + require_flatten, + weights=None): + """Internal utility to compute/validate a model's input shape. + + Args: + input_shape: Either None (will return the default network input shape), or a + user-provided shape to be validated. + default_size: Default input width/height for the model. + min_size: Minimum input width/height accepted by the model. + data_format: Image data format to use. + require_flatten: Whether the model is expected to be linked to a classifier + via a Flatten layer. + weights: One of `None` (random initialization) or 'imagenet' (pre-training + on ImageNet). If weights='imagenet' input channels must be equal to 3. + + Returns: + An integer shape tuple (may include None entries). 
+ + Raises: + ValueError: In case of invalid argument values. + """ + if weights != 'imagenet' and input_shape and len(input_shape) == 3: + if data_format == 'channels_first': + if input_shape[0] not in {1, 3}: + warnings.warn( + 'This model usually expects 1 or 3 input channels. ' + 'However, it was passed an input_shape with ' + + str(input_shape[0]) + ' input channels.', + stacklevel=2) + default_shape = (input_shape[0], default_size, default_size) + else: + if input_shape[-1] not in {1, 3}: + warnings.warn( + 'This model usually expects 1 or 3 input channels. ' + 'However, it was passed an input_shape with ' + + str(input_shape[-1]) + ' input channels.', + stacklevel=2) + default_shape = (default_size, default_size, input_shape[-1]) + else: + if data_format == 'channels_first': + default_shape = (3, default_size, default_size) + else: + default_shape = (default_size, default_size, 3) + if weights == 'imagenet' and require_flatten: + if input_shape is not None: + if input_shape != default_shape: + raise ValueError('When setting `include_top=True` ' + 'and loading `imagenet` weights, ' + f'`input_shape` should be {default_shape}. ' + f'Received: input_shape={input_shape}') + return default_shape + if input_shape: + if data_format == 'channels_first': + if input_shape is not None: + if len(input_shape) != 3: + raise ValueError('`input_shape` must be a tuple of three integers.') + if input_shape[0] != 3 and weights == 'imagenet': + raise ValueError('The input must have 3 channels; Received ' + f'`input_shape={input_shape}`') + if ((input_shape[1] is not None and input_shape[1] < min_size) or + (input_shape[2] is not None and input_shape[2] < min_size)): + raise ValueError(f'Input size must be at least {min_size}' + f'x{min_size}; Received: ' + f'input_shape={input_shape}') + else: + if input_shape is not None: + if len(input_shape) != 3: + raise ValueError('`input_shape` must be a tuple of three integers.') + if input_shape[-1] != 3 and weights == 'imagenet': + raise ValueError('The input must have 3 channels; Received ' + f'`input_shape={input_shape}`') + if ((input_shape[0] is not None and input_shape[0] < min_size) or + (input_shape[1] is not None and input_shape[1] < min_size)): + raise ValueError('Input size must be at least ' + f'{min_size}x{min_size}; Received: ' + f'input_shape={input_shape}') + else: + if require_flatten: + input_shape = default_shape + else: + if data_format == 'channels_first': + input_shape = (3, None, None) + else: + input_shape = (None, None, 3) + if require_flatten: + if None in input_shape: + raise ValueError('If `include_top` is True, ' + 'you should specify a static `input_shape`. ' + f'Received: input_shape={input_shape}') + return input_shape + + +def EfficientNet(width_coefficient, + depth_coefficient, + default_size, + dropout_rate=0.2, + drop_connect_rate=0.2, + depth_divisor=8, + activation='swish', + blocks_args='default', + model_name='efficientnet', + include_top=True, + weights='imagenet', + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation='softmax', + include_film=False): + """Instantiates the EfficientNet architecture using given scaling coefficients. + + Args: + width_coefficient: float, scaling coefficient for network width. + depth_coefficient: float, scaling coefficient for network depth. + default_size: integer, default input image size. + dropout_rate: float, dropout rate before final classifier layer. + drop_connect_rate: float, dropout rate at skip connections. 
+ depth_divisor: integer, a unit of network width. + activation: activation function. + blocks_args: list of dicts, parameters to construct block modules. + model_name: string, model name. + include_top: whether to include the fully-connected layer at the top of the + network. + weights: one of `None` (random initialization), 'imagenet' (pre-training on + ImageNet), or the path to the weights file to be loaded. + input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) to use + as image input for the model. + input_shape: optional shape tuple, only to be specified if `include_top` is + False. It should have exactly 3 inputs channels. + pooling: optional pooling mode for feature extraction when `include_top` is + `False`. - `None` means that the output of the model will be the 4D tensor + output of the last convolutional layer. - `avg` means that global average + pooling will be applied to the output of the last convolutional layer, and + thus the output of the model will be a 2D tensor. - `max` means that + global max pooling will be applied. + classes: optional number of classes to classify images into, only to be + specified if `include_top` is True, and if no `weights` argument is + specified. + classifier_activation: A `str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. + include_film: bool, whether or not to insert film conditioning layers. + + Returns: + A `keras.Model` instance. + + Raises: + ValueError: in case of invalid argument for `weights`, + or invalid input shape. + ValueError: if `classifier_activation` is not `softmax` or `None` when + using a pretrained top layer. + """ + if blocks_args == 'default': + blocks_args = DEFAULT_BLOCKS_ARGS + + if not (weights in {'imagenet', None} or tf.io.gfile.exists(weights)): + raise ValueError('The `weights` argument should be either ' + '`None` (random initialization), `imagenet` ' + '(pre-training on ImageNet), ' + 'or the path to the weights file to be loaded.') + + if weights == 'imagenet' and include_top and classes != 1000: + raise ValueError('If using `weights` as `"imagenet"` with `include_top`' + ' as true, `classes` should be 1000') + + # Determine proper input shape + input_shape = obtain_input_shape( + input_shape, + default_size=default_size, + min_size=32, + data_format=tf.keras.backend.image_data_format(), + require_flatten=include_top, + weights=weights) + + if include_film: + with tf.compat.v1.variable_scope('context_input'): + context_input = layers.Input(shape=512) + if input_tensor is None: + img_input = layers.Input(shape=input_shape) + else: + if not tf.keras.backend.is_keras_tensor(input_tensor): + img_input = layers.Input(tensor=input_tensor, shape=input_shape) + else: + img_input = input_tensor + + bn_axis = 3 if tf.keras.backend.image_data_format() == 'channels_last' else 1 + + def round_filters(filters, divisor=depth_divisor): + """Round number of filters based on depth multiplier.""" + filters *= width_coefficient + new_filters = max(divisor, int(filters + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_filters < 0.9 * filters: + new_filters += divisor + return int(new_filters) + + def round_repeats(repeats): + """Round number of repeats based on depth multiplier.""" + return int(math.ceil(depth_coefficient * repeats)) + + # Build stem + x = img_input + x = layers.Rescaling(1. 
/ 255.)(x) + x = layers.Normalization(axis=bn_axis)(x) + # Note that the normaliztion layer uses square value of STDDEV as the + # variance for the layer: result = (input - mean) / sqrt(var) + # However, the original implemenetation uses (input - mean) / var to + # normalize the input, we need to divide another sqrt(var) to match the + # original implementation. + # See https://github.com/tensorflow/tensorflow/issues/49930 for more details + # We always apply this transformation, even when not using imagenet weights, + # because it needs to be in the graph when grafting weights from imagenet + # pretrained models. + x = layers.Rescaling(1. / tf.math.sqrt(IMAGENET_STDDEV_RGB))(x) + + x = layers.ZeroPadding2D(padding=correct_pad(x, 3), name='stem_conv_pad')(x) + x = layers.Conv2D( + round_filters(32), + 3, + strides=2, + padding='valid', + use_bias=False, + kernel_initializer=CONV_KERNEL_INITIALIZER, + name='stem_conv')( + x) + x = layers.BatchNormalization(axis=bn_axis, name='stem_bn')(x) + x = layers.Activation(activation, name='stem_activation')(x) + + # Build blocks + blocks_args = copy.deepcopy(blocks_args) + + b = 0 + blocks = float(sum(round_repeats(args['repeats']) for args in blocks_args)) + for (i, args) in enumerate(blocks_args): + assert args['repeats'] > 0 + # Update block input and output filters based on depth multiplier. + args['filters_in'] = round_filters(args['filters_in']) + args['filters_out'] = round_filters(args['filters_out']) + + for j in range(round_repeats(args.pop('repeats'))): + # The first block needs to take care of stride and filter size increase. + if j > 0: + args['strides'] = 1 + args['filters_in'] = args['filters_out'] + x = block( + x, + activation, + drop_connect_rate * b / blocks, + name='block{}{}_'.format(i + 1, chr(j + 97)), + **args) + if include_film: + with tf.compat.v1.variable_scope('film_conditioning'): + x = FilmConditioning(num_channels=x.shape[-1])(x, context_input) + b += 1 + + # Build top + x = layers.Conv2D( + round_filters(1280), + 1, + padding='same', + use_bias=False, + kernel_initializer=CONV_KERNEL_INITIALIZER, + name='top_conv')( + x) + x = layers.BatchNormalization(axis=bn_axis, name='top_bn')(x) + x = layers.Activation(activation, name='top_activation')(x) + if include_top: + x = layers.GlobalAveragePooling2D(name='avg_pool')(x) + if dropout_rate > 0: + x = layers.Dropout(dropout_rate, name='top_dropout')(x) + validate_activation(classifier_activation, weights) + x = layers.Dense( + classes, + activation=classifier_activation, + kernel_initializer=DENSE_KERNEL_INITIALIZER, + name='predictions')( + x) + else: + if pooling == 'avg': + x = layers.GlobalAveragePooling2D(name='avg_pool')(x) + elif pooling == 'max': + x = layers.GlobalMaxPooling2D(name='max_pool')(x) + + # Ensure that the model takes into account + # any potential predecessors of `input_tensor`. + if input_tensor is not None: + inputs = tf.keras.utils.get_source_inputs(input_tensor) + else: + inputs = img_input + if include_film: + inputs = (img_input, context_input) + + # Create model. + model = tf.keras.Model(inputs, x, name=model_name) + + # Load weights. 
+ if weights == 'imagenet': + if include_top: + key = model_name + else: + key = model_name + '_notop' + weights_path = os.path.join(os.path.dirname(__file__), WEIGHTS_PATHS[key]) + model.load_weights(weights_path, skip_mismatch=False, by_name=False) + elif weights is not None: + model.load_weights(weights, skip_mismatch=False, by_name=False) + return model + + +def block(inputs, + activation='swish', + drop_rate=0., + name='', + filters_in=32, + filters_out=16, + kernel_size=3, + strides=1, + expand_ratio=1, + se_ratio=0., + id_skip=True): + """An inverted residual block. + + Args: + inputs: input tensor. + activation: activation function. + drop_rate: float between 0 and 1, fraction of the input units to drop. + name: string, block label. + filters_in: integer, the number of input filters. + filters_out: integer, the number of output filters. + kernel_size: integer, the dimension of the convolution window. + strides: integer, the stride of the convolution. + expand_ratio: integer, scaling coefficient for the input filters. + se_ratio: float between 0 and 1, fraction to squeeze the input filters. + id_skip: boolean. + + Returns: + output tensor for the block. + """ + bn_axis = 3 if tf.keras.backend.image_data_format() == 'channels_last' else 1 + + # Expansion phase + filters = filters_in * expand_ratio + if expand_ratio != 1: + x = layers.Conv2D( + filters, + 1, + padding='same', + use_bias=False, + kernel_initializer=CONV_KERNEL_INITIALIZER, + name=name + 'expand_conv')( + inputs) + x = layers.BatchNormalization(axis=bn_axis, name=name + 'expand_bn')(x) + x = layers.Activation(activation, name=name + 'expand_activation')(x) + else: + x = inputs + + # Depthwise Convolution + if strides == 2: + x = layers.ZeroPadding2D( + padding=correct_pad(x, kernel_size), name=name + 'dwconv_pad')( + x) + conv_pad = 'valid' + else: + conv_pad = 'same' + x = layers.DepthwiseConv2D( + kernel_size, + strides=strides, + padding=conv_pad, + use_bias=False, + depthwise_initializer=CONV_KERNEL_INITIALIZER, + name=name + 'dwconv')( + x) + x = layers.BatchNormalization(axis=bn_axis, name=name + 'bn')(x) + x = layers.Activation(activation, name=name + 'activation')(x) + + # Squeeze and Excitation phase + if 0 < se_ratio <= 1: + filters_se = max(1, int(filters_in * se_ratio)) + se = layers.GlobalAveragePooling2D(name=name + 'se_squeeze')(x) + if bn_axis == 1: + se_shape = (filters, 1, 1) + else: + se_shape = (1, 1, filters) + se = layers.Reshape(se_shape, name=name + 'se_reshape')(se) + se = layers.Conv2D( + filters_se, + 1, + padding='same', + activation=activation, + kernel_initializer=CONV_KERNEL_INITIALIZER, + name=name + 'se_reduce')( + se) + se = layers.Conv2D( + filters, + 1, + padding='same', + activation='sigmoid', + kernel_initializer=CONV_KERNEL_INITIALIZER, + name=name + 'se_expand')( + se) + x = layers.multiply([x, se], name=name + 'se_excite') + + # Output phase + x = layers.Conv2D( + filters_out, + 1, + padding='same', + use_bias=False, + kernel_initializer=CONV_KERNEL_INITIALIZER, + name=name + 'project_conv')( + x) + x = layers.BatchNormalization(axis=bn_axis, name=name + 'project_bn')(x) + if id_skip and strides == 1 and filters_in == filters_out: + if drop_rate > 0: + x = layers.Dropout( + drop_rate, noise_shape=(None, 1, 1, 1), name=name + 'drop')( + x) + x = layers.add([x, inputs], name=name + 'add') + return x + + +def maybe_restore_with_film( + *args, + weights='imagenet', + include_film=False, + **kwargs, +): + n1 = EfficientNet(*args, weights=weights, include_film=False, **kwargs) + if not 
include_film: + return n1 + # Copy the model weights over to a new model. This is necessary + # in case we have inserted early film layers. In this case, + # the pretrained weights will fail to restore properly + # unless we do this trick. + n2 = EfficientNet(*args, weights=None, include_film=True, **kwargs) + # The layers without the film layers. + l1 = {l.name: l for l in n1.layers} + # The layers with the film layers. + l2 = {l.name: l for l in n2.layers} + for layer_name, layer in l2.items(): + if layer_name in l1: + layer.set_weights(l1[layer_name].get_weights()) + # Annoyingly, the rescaling and normalization layers get different names + # in each graph. + elif 'rescaling' in layer_name: + _, num = layer_name.split('_') + l1_layer_name = 'rescaling_' + str(int(num) - 2 or '') + l1_layer_name = l1_layer_name.rstrip('_') + layer.set_weights(l1[l1_layer_name].get_weights()) + elif 'normalization' in layer_name: + _, num = layer_name.split('_') + l1_layer_name = 'normalization_' + str(int(num) - 1 or '') + l1_layer_name = l1_layer_name.rstrip('_') + layer.set_weights(l1[l1_layer_name].get_weights()) + return n2 + + +def EfficientNetB3(include_top=True, + weights='imagenet', + input_tensor=None, + input_shape=None, + pooling=None, + classes=1000, + classifier_activation='softmax', + include_film=False, + **kwargs): + return maybe_restore_with_film( + 1.2, + 1.4, + 300, + 0.3, + model_name='efficientnetb3', + include_top=include_top, + weights=weights, + input_tensor=input_tensor, + input_shape=input_shape, + pooling=pooling, + classes=classes, + classifier_activation=classifier_activation, + include_film=include_film, + **kwargs) + + +EfficientNetB3.__doc__ = BASE_DOCSTRING.format(name='EfficientNetB3') + + +def preprocess_input(x, data_format=None): # pylint: disable=unused-argument + """A placeholder method for backward compatibility. + + The preprocessing logic has been included in the efficientnet model + implementation. Users are no longer required to call this method to normalize + the input data. This method does nothing and only kept as a placeholder to + align the API surface between old and new version of model. + + Args: + x: A floating point `numpy.array` or a `tf.Tensor`. + data_format: Optional data format of the image tensor/array. Defaults to + None, in which case the global setting `tf.keras.image_data_format() is + used (unless you changed it, it defaults to "channels_last").{mode} + + Returns: + Unchanged `numpy.array` or `tf.Tensor`. + """ + return x + + +def decode_predictions(preds, top=5): + global CLASS_INDEX + if CLASS_INDEX is None: + with open(os.path.join(os.path.dirname(__file__), IMAGENET_JSON_PATH)) as f: + CLASS_INDEX = json.load(f) + results = [] + for pred in preds: + top_indices = pred.argsort()[-top:][::-1] + result = [tuple(CLASS_INDEX[str(i)]) + (pred[i],) for i in top_indices] + result.sort(key=lambda x: x[2], reverse=True) + results.append(result) + return results diff --git a/film_efficientnet/film_efficientnet_encoder_test.py b/film_efficientnet/film_efficientnet_encoder_test.py new file mode 100644 index 0000000..fb271c8 --- /dev/null +++ b/film_efficientnet/film_efficientnet_encoder_test.py @@ -0,0 +1,73 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests that film_efficientnet can detect an image of a cat.""" + +from absl.testing import parameterized +import numpy as np +from robotics_transformer.film_efficientnet import film_efficientnet_encoder +from skimage import data +import tensorflow as tf + + +class FilmEfficientnetTest(tf.test.TestCase, parameterized.TestCase): + + def _helper(self, include_film, model_variant): + if model_variant == 'b0': + size = 224 + fe = film_efficientnet_encoder.EfficientNetB0 + elif model_variant == 'b1': + size = 240 + fe = film_efficientnet_encoder.EfficientNetB1 + elif model_variant == 'b2': + size = 260 + fe = film_efficientnet_encoder.EfficientNetB2 + elif model_variant == 'b3': + size = 300 + fe = film_efficientnet_encoder.EfficientNetB3 + elif model_variant == 'b4': + size = 380 + fe = film_efficientnet_encoder.EfficientNetB4 + elif model_variant == 'b5': + size = 456 + fe = film_efficientnet_encoder.EfficientNetB5 + elif model_variant == 'b6': + size = 528 + fe = film_efficientnet_encoder.EfficientNetB6 + elif model_variant == 'b7': + size = 600 + fe = film_efficientnet_encoder.EfficientNetB7 + else: + raise ValueError(f'Unknown variant: {model_variant}') + fe = fe(include_top=True, weights='imagenet', include_film=include_film) + image = np.expand_dims(data.chelsea(), axis=0) + image = tf.image.resize(image, (size, size)) + context = np.random.randn(1, 512) + if include_film: + eff_output = fe( + (film_efficientnet_encoder.preprocess_input(image), context), + training=False) + else: + eff_output = fe( + film_efficientnet_encoder.preprocess_input(image), training=False) + film_preds = film_efficientnet_encoder.decode_predictions( + eff_output.numpy(), top=10) + self.assertIn('tabby', [f[1] for f in film_preds[0]]) + + @parameterized.parameters([True, False]) + def test_keras_equivalence_b3(self, include_film): + self._helper(include_film, 'b3') + + +if __name__ == '__main__': + tf.test.main() diff --git a/film_efficientnet/preprocessors.py b/film_efficientnet/preprocessors.py new file mode 100644 index 0000000..2b2ce01 --- /dev/null +++ b/film_efficientnet/preprocessors.py @@ -0,0 +1,108 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
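A minimal usage sketch of the FiLM-conditioned EfficientNet-B3 classifier defined above, assuming the package is importable as robotics_transformer and the bundled efficientnet_checkpoints weights are present; the cat image, 300x300 input size, and 512-d context mirror the test file above:

import numpy as np
from robotics_transformer.film_efficientnet import film_efficientnet_encoder
from skimage import data
import tensorflow as tf

# Build EfficientNet-B3 with FiLM layers inserted after each block; pretrained
# ImageNet weights are grafted in via maybe_restore_with_film.
model = film_efficientnet_encoder.EfficientNetB3(
    include_top=True, weights='imagenet', include_film=True)

image = tf.image.resize(np.expand_dims(data.chelsea(), axis=0), (300, 300))
context = np.random.randn(1, 512)  # FiLM conditioning vector, one per batch element.
logits = model(
    (film_efficientnet_encoder.preprocess_input(image), context), training=False)
print(film_efficientnet_encoder.decode_predictions(logits.numpy(), top=3))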
+"""Preprocessing functions for transforming the image for training.""" + +from typing import Optional + +import gin +import tensorflow.compat.v2 as tf + +CROP_SIZE = 472 + + +@gin.configurable( + denylist=['images', 'crop_size', 'training', 'convert_dtype', 'seed']) +def convert_dtype_and_crop_images(images, + crop_size: int = CROP_SIZE, + training: bool = True, + pad_then_crop: bool = False, + convert_dtype: bool = True, + seed: Optional[tf.Tensor] = None): + """Convert uint8 [512, 640, 3] images to float32 and square crop. + + Args: + images: [B, H, W, 3] uint8 tensor of images. + crop_size: Width of the square crop. + training: If we are in training (random crop) or not-training (fixed crop). + pad_then_crop: If True, pads image and then crops the original image size. + This allows full field of view to be extracted. + convert_dtype: whether or not to convert the image to float32 in the range + of (0, 1). + seed: Optional seed of shape (2,) for giving to tf.random.stateless_uniform + + Returns: + [B, crop_size, crop_size, 3] images of dtype float32. + """ + + if seed is None: + seed = tf.random.uniform(shape=(2,), maxval=2**30, dtype=tf.int32) + + seed2 = tf.random.experimental.stateless_split(seed, num=1)[0] + + if convert_dtype: + images = tf.image.convert_image_dtype(images, tf.float32) + image_height = images.get_shape().as_list()[-3] + image_width = images.get_shape().as_list()[-2] + + if pad_then_crop: + + if training: + if image_height == 512: + ud_pad = 40 + lr_pad = 100 + elif image_height == 256: + ud_pad = 20 + lr_pad = 50 + else: + raise ValueError( + 'convert_dtype_and_crop_images only supports image height 512 or ' + '256.') + max_y = 2 * ud_pad + max_x = 2 * lr_pad + images = tf.image.pad_to_bounding_box( + images, + offset_height=ud_pad, + offset_width=lr_pad, + target_height=image_height + 2 * ud_pad, + target_width=image_width + 2 * lr_pad) + offset_y = tf.random.stateless_uniform((), + maxval=max_y + 1, + dtype=tf.int32, + seed=seed) + offset_x = tf.random.stateless_uniform((), + maxval=max_x + 1, + dtype=tf.int32, + seed=seed2) + images = tf.image.crop_to_bounding_box(images, offset_y, offset_x, + image_height, image_width) + else: + # Standard cropping. + max_y = image_height - crop_size + max_x = image_width - crop_size + + if training: + offset_y = tf.random.stateless_uniform((), + maxval=max_y + 1, + dtype=tf.int32, + seed=seed) + offset_x = tf.random.stateless_uniform((), + maxval=max_x + 1, + dtype=tf.int32, + seed=seed2) + images = tf.image.crop_to_bounding_box(images, offset_y, offset_x, + crop_size, crop_size) + else: + images = tf.image.crop_to_bounding_box(images, max_y // 2, max_x // 2, + crop_size, crop_size) + return images diff --git a/film_efficientnet/preprocessors_test.py b/film_efficientnet/preprocessors_test.py new file mode 100644 index 0000000..c61e5d6 --- /dev/null +++ b/film_efficientnet/preprocessors_test.py @@ -0,0 +1,83 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tests for preprocessors.""" +from absl.testing import parameterized +import numpy as np +from robotics_transformer.film_efficientnet import preprocessors +from tensor2robot.utils import tensorspec_utils +import tensorflow.compat.v2 as tf + + +def _random_image(shape): + images = tf.random.uniform( + shape, minval=0, maxval=255, dtype=tf.dtypes.int32, seed=42) + return tf.cast(images, tf.uint8) + + +def _get_features( + image_shape=(2, 512, 640, 3), use_task_image=False, use_goal_image=False): + # Time-dimension stacking occurs during training but not eval. + state = tensorspec_utils.TensorSpecStruct(image=_random_image(image_shape)) + if use_task_image: + state.task_image = _random_image(image_shape) + if use_goal_image: + state.goal_image = _random_image(image_shape) + return state + + +class PreprocessorsTest(tf.test.TestCase, parameterized.TestCase): + + @parameterized.parameters((True, False, False), (False, True, False), + (True, False, True), (False, True, True)) + def testConvertDtypeAndCropImages(self, training, pad_then_crop, + convert_dtype): + features = _get_features() + images = preprocessors.convert_dtype_and_crop_images( + features.image, + training=training, + pad_then_crop=pad_then_crop, + convert_dtype=convert_dtype) + expected_cropped_shape = ([2, 512, 640, 3] + if pad_then_crop else [2, 472, 472, 3]) + tf.ensure_shape(images, expected_cropped_shape) + if convert_dtype: + self.assertEqual(images.dtype, tf.float32) + self.assertLessEqual(images.numpy().max(), 1.) + self.assertGreaterEqual(images.numpy().min(), 0.) + else: + self.assertEqual(images.dtype, tf.uint8) + self.assertLessEqual(images.numpy().max(), 255) + self.assertGreaterEqual(images.numpy().min(), 0) + self.assertGreater(images.numpy().max(), 1) + + def testConvertDtypeAndCropImagesSeeded(self): + features = _get_features() + seed = tf.constant([1, 2], tf.int32) + images1 = preprocessors.convert_dtype_and_crop_images( + features.image, training=True, pad_then_crop=True, seed=seed) + images2 = preprocessors.convert_dtype_and_crop_images( + features.image, training=True, pad_then_crop=True, seed=seed) + diff = np.sum(np.abs(images1.numpy() - images2.numpy())) + self.assertAlmostEqual(diff, 0) + + def testConvertDtypeAndCropImagesUnseeded(self): + features = _get_features() + seed1 = tf.constant([1, 2], tf.int32) + images1 = preprocessors.convert_dtype_and_crop_images( + features.image, training=True, pad_then_crop=True, seed=seed1) + seed2 = tf.constant([2, 3], tf.int32) + images2 = preprocessors.convert_dtype_and_crop_images( + features.image, training=True, pad_then_crop=True, seed=seed2) + diff = np.sum(np.abs(images1.numpy() - images2.numpy())) + self.assertNotAlmostEqual(diff, 0) diff --git a/film_efficientnet/pretrained_efficientnet_encoder.py b/film_efficientnet/pretrained_efficientnet_encoder.py new file mode 100644 index 0000000..ad7ca70 --- /dev/null +++ b/film_efficientnet/pretrained_efficientnet_encoder.py @@ -0,0 +1,122 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Encoder based on Efficientnet.""" + +from typing import Optional + +import gin +from robotics_transformer.film_efficientnet import film_conditioning_layer +from robotics_transformer.film_efficientnet import film_efficientnet_encoder +import tensorflow as tf + +_MODELS = { + 'b3': film_efficientnet_encoder.EfficientNetB3, +} + +_SIZES = { + 'b3': 300, +} + + +@gin.configurable +class EfficientNetEncoder(tf.keras.layers.Layer): + """Applies a pretrained Efficientnet based encoder.""" + + def __init__(self, + model_variant: str = 'b3', + freeze: bool = False, + early_film: bool = True, + weights: Optional[str] = 'imagenet', + include_top: bool = False, + pooling: bool = True, + **kwargs): + """Initialize the model. + + Args: + model_variant: One of 'b0-b7' of the efficient encoders. See + https://arxiv.org/abs/1905.11946 to understand the variants. + freeze: Whether or not to freeze the pretrained weights (seems to not work + well). + early_film: Whether to inject film layers into the efficientnet encoder + (seems to be essential to getting strong performance). + weights: Which pretrained weights to use. Either 'imagenet', a path to the + pretrained weights, or None for from scratch. + include_top: Whether to add the top fully connected layer. If True, this + will cause encoding to fail and is used only for unit testing purposes. + pooling: If false, returns feature map before global average pooling + **kwargs: Keras specific layer kwargs. + """ + super(EfficientNetEncoder, self).__init__(**kwargs) + if model_variant not in _MODELS: + raise ValueError(f'Unknown variant {model_variant}') + self.model_variant = model_variant + self.early_film = early_film + self.freeze = freeze + self.conv1x1 = tf.keras.layers.Conv2D( + filters=512, + kernel_size=(1, 1), + strides=(1, 1), + padding='SAME', + use_bias=False, + kernel_initializer=tf.keras.initializers.VarianceScaling()) + self.net = _MODELS[model_variant]( + include_top=include_top, + weights=weights, + include_film=early_film, + ) + self.film_layer = film_conditioning_layer.FilmConditioning(num_channels=512) + self._pooling = pooling + + def _prepare_image(self, image: tf.Tensor) -> tf.Tensor: + """Resize the input image and check that the range is correct.""" + if len(image.shape) != 4 or image.shape[-1] != 3: + raise ValueError('Provided image should have shape (b, h, w, 3).') + size = _SIZES[self.model_variant] + if image.shape[1] < size / 4 or image.shape[2] < size / 4: + raise ValueError('Provided image is too small.') + if image.shape[1] > size * 4 or image.shape[2] > size * 4: + raise ValueError('Provided image is too large.') + image = tf.image.resize(image, (size, size)) + c1 = tf.Assert(tf.reduce_max(image) <= 1, data=[tf.reduce_max(image)]) + c2 = tf.Assert(tf.reduce_min(image) >= 0, data=[tf.reduce_min(image)]) + with tf.control_dependencies([c1, c2]): + image *= 255 # The image is expected to be in range(0, 255). 
+ image = film_efficientnet_encoder.preprocess_input(image) + return image + + def _encode(self, image: tf.Tensor, context: tf.Tensor, + training: bool) -> tf.Tensor: + """Run the image through the efficientnet encoder.""" + image = self._prepare_image(image) + if self.early_film: + return self.net((image, context), training=training) + return self.net(image, training=training) + + def call(self, + image: tf.Tensor, + context: Optional[tf.Tensor] = None, + training: bool = True) -> tf.Tensor: + if self.freeze: + features = tf.stop_gradient(self._encode(image, context, training)) + else: + features = self._encode(image, context, training) + if context is not None: + features = self.conv1x1(features) + features = self.film_layer(features, context) + + if not self._pooling: + return features + + # Global average pool. + return tf.reduce_mean(features, [1, 2]) diff --git a/film_efficientnet/pretrained_efficientnet_encoder_test.py b/film_efficientnet/pretrained_efficientnet_encoder_test.py new file mode 100644 index 0000000..6b9b9ab --- /dev/null +++ b/film_efficientnet/pretrained_efficientnet_encoder_test.py @@ -0,0 +1,49 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for pretrained_efficientnet_encoder.""" + +import numpy as np +from robotics_transformer.film_efficientnet import film_efficientnet_encoder +from robotics_transformer.film_efficientnet import pretrained_efficientnet_encoder as eff +from skimage import data +import tensorflow as tf + + +class PretrainedEfficientnetEncoderTest(tf.test.TestCase): + + def test_encoding(self): + """Test that we get a correctly shaped decoding.""" + state = np.random.RandomState(0) + context = state.uniform(-1, 1, (10, 512)) + model = eff.EfficientNetEncoder() + image = np.expand_dims(data.chelsea(), axis=0) / 255 + preds = model(image, context, training=False).numpy() + self.assertEqual(preds.shape, (10, 512)) + + def test_imagenet_classification(self): + """Test that we can correctly classify an image of a cat.""" + state = np.random.RandomState(0) + context = state.uniform(-1, 1, (10, 512)) + model = eff.EfficientNetEncoder(include_top=True) + image = np.expand_dims(data.chelsea(), axis=0) / 255 + preds = model._encode(image, context, training=False).numpy() + predicted_names = [ + n[1] + for n in film_efficientnet_encoder.decode_predictions(preds, top=3)[0] + ] + self.assertIn('tabby', predicted_names) + + +if __name__ == '__main__': + tf.test.main() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1a1324b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,44 @@ +[project] +name = "robotics_transformer" +description = "" +readme = "README.md" +requires-python = ">=3.7" +license = {file = "LICENSE"} +authors = [{name = "robotics_transformer authors", email="robotics_transformer@google.com"}] +classifiers = [ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "License :: OSI Approved :: Apache Software License", + "Intended Audience :: 
Science/Research", +] +keywords = [] + +# pip dependencies of the project +dependencies = [] + +# This is set automatically by flit using `robotics_transformer.__version__` +dynamic = ["version"] + +[project.urls] +homepage = "https://github.com/google-research/robotics_transformer" +repository = "https://github.com/google-research/robotics_transformer" +# Other: `documentation`, `changelog` + +[project.optional-dependencies] +# Development deps (unittest, linting, formating,...) +# Installed through `pip install .[dev]` +dev = [ + "pytest", + "pytest-xdist", + "pylint>=2.6.0", + "pyink", +] + +[tool.pyink] +# Formatting configuration to follow Google style-guide +pyink-indentation = 2 +pyink-use-majority-quotes = true + +[build-system] +requires = ["flit_core >=3.5,<4"] +build-backend = "flit_core.buildapi" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c65dd3b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +absl-py>=0.5.0 +numpy>=1.13.3 +tensorflow>=1.13.0 +tensorflow-serving-api>=1.13.0 +gin-config>=0.1.4 +tensorflow-probability>=0.6.0 +tf-agents>=0.3.0 +tf-slim>=1.0 +git+https://github.com/google-research/tensor2robot#tensor2robot diff --git a/robotics_transformer.blueprint b/robotics_transformer.blueprint new file mode 100644 index 0000000..4ed6011 --- /dev/null +++ b/robotics_transformer.blueprint @@ -0,0 +1,19 @@ +include "devtools/blueprint/bluze/public/bluze.ncl"; +include bytes "third_party/py/robotics_transformer/bluze.textproto" as textproto; + +// See go/bluze/guide before editing. To check the generated final blueprint run +// rncl third_party/py/robotics_transformer/robotics_transformer.blueprint printproto blueprint_file + +blueprint_file = ::bluze::BlueprintFile( + textproto, + + project_name = "robotics_transformer", + teams_product_id = 9019942154, + tech_lead = ["keerthanapg"], + dev_mailing_list = "robotics_transformer-automated@google.com", + mdb_groups = ["robotics"], + buganizer_component_ids = [1150225], + metadata_path = "//depot/google3/third_party/py/robotics_transformer/METADATA", + +// Customize your blueprint here: go/blueprint/howto-write. +); diff --git a/sequence_agent.py b/sequence_agent.py new file mode 100644 index 0000000..dd1b207 --- /dev/null +++ b/sequence_agent.py @@ -0,0 +1,173 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Sequence policy and agent that directly output actions via actor network. + +These classes are not intended to change as they are generic enough for any +all-neural actor based agent+policy. All new features are intended to be +implemented in `actor_network` and `loss_fn`. + +# TODO(b/231896343): Update litred docs on how to use these. 
+""" +from typing import Optional, Type + +from absl import logging +import tensorflow as tf +from tf_agents.agents import data_converter +from tf_agents.agents import tf_agent +from tf_agents.networks import network +from tf_agents.policies import actor_policy +from tf_agents.trajectories import policy_step +from tf_agents.trajectories import time_step as ts +from tf_agents.typing import types +from tf_agents.utils import nest_utils + + +class SequencePolicy(actor_policy.ActorPolicy): + """A policy that directly outputs actions via an actor network.""" + + def __init__(self, **kwargs): + self._actions = None + super().__init__(**kwargs) + + def set_actions(self, actions): + self._actor_network.set_actions(actions) + + def get_actor_loss(self): + return self._actor_network.get_actor_loss() + + def get_aux_info(self): + return self._actor_network.get_aux_info() + + def set_training(self, training): + self._training = training + + def _action(self, + time_step: ts.TimeStep, + policy_state: types.NestedTensor, + seed: Optional[types.Seed] = None) -> policy_step.PolicyStep: + del seed + action, policy_state = self._apply_actor_network( + time_step.observation, + step_type=time_step.step_type, + policy_state=policy_state) + info = () + return policy_step.PolicyStep(action, policy_state, info) + + def _distribution(self, time_step, policy_state): + current_step = super()._distribution(time_step, policy_state) + return current_step + + +class SequenceAgent(tf_agent.TFAgent): + """A sequence agent that directly outputs actions via an actor network.""" + + def __init__(self, + time_step_spec: ts.TimeStep, + action_spec: types.NestedTensorSpec, + actor_network: Type[network.Network], + actor_optimizer: tf.keras.optimizers.Optimizer, + policy_cls: Type[actor_policy.ActorPolicy] = SequencePolicy, + time_sequence_length: int = 6, + debug_summaries: bool = False, + **kwargs): + self._info_spec = () + self._actor_network = actor_network( # pytype: disable=missing-parameter # dynamic-method-lookup + input_tensor_spec=time_step_spec.observation, + output_tensor_spec=action_spec, + policy_info_spec=self._info_spec, + train_step_counter=kwargs['train_step_counter'], + time_sequence_length=time_sequence_length) + + self._actor_optimizer = actor_optimizer + # Train policy is only used for loss and never exported as saved_model. + self._train_policy = policy_cls( + time_step_spec=time_step_spec, + action_spec=action_spec, + info_spec=self._info_spec, + actor_network=self._actor_network, + training=True) + collect_policy = policy_cls( + time_step_spec=time_step_spec, + action_spec=action_spec, + info_spec=self._info_spec, + actor_network=self._actor_network, + training=False) + super(SequenceAgent, self).__init__( + time_step_spec, + action_spec, + collect_policy, # We use the collect_policy as the eval policy. 
+ collect_policy, + train_sequence_length=time_sequence_length, + **kwargs) + self._data_context = data_converter.DataContext( + time_step_spec=time_step_spec, + action_spec=action_spec, + info_spec=collect_policy.info_spec, + use_half_transition=True) + self.as_transition = data_converter.AsHalfTransition( + self._data_context, squeeze_time_dim=False) + self._debug_summaries = debug_summaries + + num_params = 0 + for weight in self._actor_network.trainable_weights: + weight_params = 1 + for dim in weight.shape: + weight_params *= dim + logging.info('%s has %s params.', weight.name, weight_params) + num_params += weight_params + logging.info('Actor network has %sM params.', round(num_params / 1000000., + 2)) + + def _train(self, experience: types.NestedTensor, + weights: types.Tensor) -> tf_agent.LossInfo: + self.train_step_counter.assign_add(1) + loss_info = self._loss(experience, weights, training=True) + self._apply_gradients(loss_info.loss) + return loss_info + + def _apply_gradients(self, loss: types.Tensor): + variables = self._actor_network.trainable_weights + gradients = tf.gradients(loss, variables) + # Skip nan and inf gradients. + new_gradients = [] + for g in gradients: + if g is not None: + new_g = tf.where( + tf.math.logical_or(tf.math.is_inf(g), tf.math.is_nan(g)), + tf.zeros_like(g), g) + new_gradients.append(new_g) + else: + new_gradients.append(g) + grads_and_vars = list(zip(new_gradients, variables)) + self._actor_optimizer.apply_gradients(grads_and_vars) + + def _loss(self, experience: types.NestedTensor, weights: types.Tensor, + training: bool) -> tf_agent.LossInfo: + transition = self.as_transition(experience) + time_steps, policy_steps, _ = transition + batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0] + policy = self._train_policy + policy.set_actions(policy_steps.action) + policy.set_training(training=training) + with tf.name_scope('actor_loss'): + policy_state = policy.get_initial_state(batch_size) + policy.action(time_steps, policy_state=policy_state) + valid_mask = tf.cast(~time_steps.is_last(), tf.float32) + loss = valid_mask * policy.get_actor_loss() + loss = tf.reduce_mean(loss) + policy.set_actions(None) + self._actor_network.add_summaries(time_steps.observation, + policy.get_aux_info(), + self._debug_summaries, training) + return tf_agent.LossInfo(loss=loss, extra=loss) diff --git a/sequence_agent_test.py b/sequence_agent_test.py new file mode 100644 index 0000000..1eee013 --- /dev/null +++ b/sequence_agent_test.py @@ -0,0 +1,28 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
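The NaN/Inf handling inside SequenceAgent._apply_gradients above can be read as a small standalone helper. The following sketch restates that pattern; the helper name sanitize_gradients is introduced here for illustration only:

import tensorflow as tf

def sanitize_gradients(gradients):
  """Zeros out inf/nan entries of each gradient; None gradients pass through."""
  sanitized = []
  for g in gradients:
    if g is None:
      sanitized.append(g)
    else:
      sanitized.append(
          tf.where(
              tf.math.logical_or(tf.math.is_inf(g), tf.math.is_nan(g)),
              tf.zeros_like(g), g))
  return sanitized

grads = [tf.constant([1.0, float('nan'), float('inf')]), None]
print(sanitize_gradients(grads)[0].numpy())  # [1. 0. 0.]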
+"""Tests for sequence_agent.""" +from robotics_transformer.sequence_agent_test_set_up import SequenceAgentTestSetUp +import tensorflow as tf +from tf_agents.agents import data_converter + + +class SequenceAgentTest(SequenceAgentTestSetUp): + + def testAsTransitionType(self): + agent = self.create_agent_and_initialize() + self.assertIsInstance(agent.as_transition, data_converter.AsHalfTransition) + + +if __name__ == '__main__': + tf.test.main() diff --git a/sequence_agent_test_set_up.py b/sequence_agent_test_set_up.py new file mode 100644 index 0000000..0e370f0 --- /dev/null +++ b/sequence_agent_test_set_up.py @@ -0,0 +1,144 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for sequence_agent.""" +from typing import Type + +import numpy as np +from robotics_transformer import sequence_agent +from tensor2robot.utils import tensorspec_utils +import tensorflow as tf +from tf_agents.networks import network +from tf_agents.policies import policy_saver +from tf_agents.specs import tensor_spec +from tf_agents.trajectories import time_step as ts + + +class DummyActorNet(network.Network): + """Used for testing SequenceAgent and its subclass.""" + + def __init__(self, + output_tensor_spec=None, + train_step_counter=None, + policy_info_spec=None, + time_sequence_length=1, + use_tcl=False, + **kwargs): + super().__init__(**kwargs) + + @property + def tokens_per_action(self): + return 8 + + def set_actions(self, actions): + self._actions = actions + + def get_actor_loss(self): + return self._actor_loss + + def call(self, + observations, + step_type, + network_state, + actions=None, + training=False): + del step_type + image = observations['image'] + tf.expand_dims(tf.reduce_mean(image, axis=-1), -1) + actions = tensorspec_utils.TensorSpecStruct( + world_vector=tf.constant(1., shape=[1, 3]), + rotation_delta=tf.constant(1., shape=[1, 3]), + terminate_episode=tf.constant(1, shape=[1, 2]), + gripper_closedness_action=tf.constant(1., shape=[1, 1]), + ) + return actions, network_state + + @property + def trainable_weights(self): + return [tf.Variable(1.0)] + + +class SequenceAgentTestSetUp(tf.test.TestCase): + """Defines spec for testing SequenceAgent and its subclass, tests create.""" + + def setUp(self): + super().setUp() + self._action_spec = tensorspec_utils.TensorSpecStruct() + self._action_spec.world_vector = tensor_spec.BoundedTensorSpec( + (3,), dtype=tf.float32, minimum=-1., maximum=1., name='world_vector') + + self._action_spec.rotation_delta = tensor_spec.BoundedTensorSpec( + (3,), + dtype=tf.float32, + minimum=-np.pi / 2, + maximum=np.pi / 2, + name='rotation_delta') + + self._action_spec.gripper_closedness_action = tensor_spec.BoundedTensorSpec( + (1,), + dtype=tf.float32, + minimum=-1., + maximum=1., + name='gripper_closedness_action') + self._action_spec.terminate_episode = tensor_spec.BoundedTensorSpec( + (2,), dtype=tf.int32, minimum=0, maximum=1, name='terminate_episode') + + state_spec = tensorspec_utils.TensorSpecStruct() + state_spec.image = 
tensor_spec.BoundedTensorSpec([256, 320, 3], + dtype=tf.float32, + name='image', + minimum=0., + maximum=1.) + state_spec.natural_language_embedding = tensor_spec.TensorSpec( + shape=[512], dtype=tf.float32, name='natural_language_embedding') + self._time_step_spec = ts.time_step_spec(observation_spec=state_spec) + + self.sequence_agent_cls = sequence_agent.SequenceAgent + + def create_agent_and_initialize(self, + actor_network: Type[ + network.Network] = DummyActorNet, + **kwargs): + """Creates the agent and initialize it.""" + agent = self.sequence_agent_cls( + time_step_spec=self._time_step_spec, + action_spec=self._action_spec, + actor_network=actor_network, + actor_optimizer=tf.keras.optimizers.Adam(), + train_step_counter=tf.compat.v1.train.get_or_create_global_step(), + **kwargs) + agent.initialize() + return agent + + def testCreateAgent(self): + """Creates the Agent and save the agent.policy.""" + agent = self.create_agent_and_initialize() + self.assertIsNotNone(agent.policy) + + policy_model_saver = policy_saver.PolicySaver( + agent.policy, + train_step=tf.compat.v2.Variable( + 0, + trainable=False, + dtype=tf.int64, + aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, + shape=()), + input_fn_and_spec=None) + save_options = tf.saved_model.SaveOptions( + experimental_io_device='/job:localhost', + experimental_custom_gradients=False) + policy_model_saver.save('/tmp/unittest/policy/0', options=save_options) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tokenizers/__init__.py b/tokenizers/__init__.py new file mode 100644 index 0000000..6d6d126 --- /dev/null +++ b/tokenizers/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tokenizers/action_tokenizer.py b/tokenizers/action_tokenizer.py new file mode 100644 index 0000000..321bffb --- /dev/null +++ b/tokenizers/action_tokenizer.py @@ -0,0 +1,157 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A simple action tokenizer used with Robotics Transformer 1. + +As an example, if an action is: +terminate = [0, 1] +world_vector = [0.9, 0.8, -0.3] +rotation_delta = [-0.1, 0.2, .6] +gripper_closedness = 0.9 + +Then we build a sequence of tokens of length 8 [one for each dimension]. +The int32 type action dimensions are already assumed discrete and tokenized, +the float dimensions are bucketed according to the specs min and max. Each +dimension has 'vocab_size' buckets. 
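+
+For example, with vocab_size=256 and a float dimension bounded to [-1, 1],
+the value 0.9 falls into bucket int(((0.9 + 1) / 2) * 255) = 242.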
+ +Currently, this tokenizer assumes one action spec and it is highly recommended +to specify the 'action_order', eg [terminate, world_vector, rotation_delta, +gripper_closedness]. Since after tokenization you lose that information, this +will be useful for debugging. Actions may also be subselected for prediction, +since not all actions are needed in the action_order. +""" +from typing import Optional + +from tensor2robot.utils import tensorspec_utils +import tensorflow as tf + + +class RT1ActionTokenizer: + """Tokenizes based on vocab size.""" + + def __init__(self, + action_spec: tensorspec_utils.TensorSpecStruct, + vocab_size: int, + action_order: Optional[list[str]] = None): + """Instantiates an RT1ActionTokenizer. + + Args: + action_spec: Tensor spec of the expected action tensor. + vocab_size: Number of buckets to discretize action to. + action_order: Order of the action names, used to discern the order of + tokenized actions to detokenize and assemble back to action tensor + """ + self._action_spec = action_spec + self._vocab_size = vocab_size + if action_order is None: + self._action_order = self._action_spec.keys() + else: + for action in action_order: + if action not in self._action_spec.keys(): + raise ValueError('actions: %s not found in action_spec: %s' % + (action, action_spec.keys())) + assert action in self._action_spec.keys() + self._action_order = action_order + self._tokens_per_action = 0 + for action in self._action_order: + action_shape = self._action_spec[action].shape + if len(action_shape) != 1: + raise ValueError( + 'Only action shapes with single dimension supported, got %s' % + action_shape) + if self._action_spec[action].dtype == tf.int32: + # Int32 actions are already assumed to be tokens. + self._tokens_per_action += 1 + else: + self._tokens_per_action += action_shape[0] + + # We measure # of action tokens in two different way. One is by checking + # from action_order (above) and the other is by looping through the + # action spec (below). We aseert the # of action tokens are the same + # calculated by these two ways. This will assure action_order is correctly + # configured, otherwise, it will through an error in the assert. + num_action_token = 0 + for spec in self._action_spec.values(): + if spec.dtype == tf.int32: + num_action_token += 1 + else: + num_action_token += spec.shape[-1] + tf.debugging.assert_equal(num_action_token, self._tokens_per_action) + + @property + def tokens_per_action(self) -> int: + return self._tokens_per_action + + @property + def action_spec(self) -> tensorspec_utils.TensorSpecStruct: + return self._action_spec + + @property + def action_order(self) -> list[str]: + return self._action_order + + def tokenize(self, action: tensorspec_utils.TensorSpecStruct) -> tf.Tensor: + """Tokenizes an action.""" + action_tokens = [] + for k in self._action_order: + a = action[k] # a is [batch, actions_size] + spec = self._action_spec[k] + if spec.dtype == tf.int32: + # Int32 actions are already assumed to be tokens, assume it is smaller + # than the vocab size, so all we need to do is pad zeros. 
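+        # The assertion below enforces that the int32 action is one-hot;
+        # argmax then recovers the single token index.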
+ tf.debugging.assert_equal(1, tf.reduce_sum(a, axis=-1)) + # extract the token [batch, 1] + token = tf.argmax(a, axis=-1, output_type=tf.int32) + tf.debugging.assert_less(token, self._vocab_size) + # Add a seq dimension [batch, 1] + token = tf.expand_dims(token, axis=-1) + else: + a = tf.clip_by_value(a, spec.minimum, spec.maximum) + # Normalize the action [batch, actions_size] + token = (a - spec.minimum) / (spec.maximum - spec.minimum) + # Bucket and discretize the action to vocab_size, [batch, actions_size] + token = tf.cast(token * (self._vocab_size - 1), tf.int32) + action_tokens.append(token) + # Append all actions, [batch, all_actions_size] + action_tokens = tf.concat(action_tokens, axis=-1) + return action_tokens + + def detokenize(self, + action_tokens: tf.Tensor) -> tensorspec_utils.TensorSpecStruct: + """Detokenizes an action.""" + action = tensorspec_utils.TensorSpecStruct() + token_index = 0 + for k in self._action_order: + spec = self._action_spec[k] + action_dim = spec.shape[0] + if spec.dtype == tf.int32: + # Int32 actions are already assumed to be tokens. + action[k] = action_tokens[..., token_index] + # A poor model may output tokens outside the allowed range, in that case + # set them to a default value, the 0 token in this case. + outside_range = tf.greater_equal(action[k], action_dim) + action[k] = tf.where(outside_range, tf.zeros_like(action[k]), action[k]) + action[k] = tf.one_hot( + action[k], depth=action_dim, axis=-1, dtype=tf.int32) + token_index += 1 + else: + actions = [] + for _ in range(action_dim): + a = action_tokens[..., token_index:token_index + 1] + a = tf.cast(a, tf.float32) + a = a / (self._vocab_size - 1) + a = (a * (spec.maximum - spec.minimum)) + spec.minimum + actions.append(a) + token_index += 1 + action[k] = tf.concat(actions, axis=-1) + return action diff --git a/tokenizers/action_tokenizer_test.py b/tokenizers/action_tokenizer_test.py new file mode 100644 index 0000000..a0181a8 --- /dev/null +++ b/tokenizers/action_tokenizer_test.py @@ -0,0 +1,191 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
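The float bucketing that RT1ActionTokenizer.tokenize and detokenize perform above reduces to a little arithmetic. A plain-NumPy sketch of the round trip, with vocab_size and the action bounds chosen as example values:

import numpy as np

vocab_size = 256                 # example value only
minimum, maximum = -1.0, 1.0     # example spec bounds

def tokenize(a):
  a = np.clip(a, minimum, maximum)
  normalized = (a - minimum) / (maximum - minimum)
  return (normalized * (vocab_size - 1)).astype(np.int32)

def detokenize(tokens):
  return tokens / (vocab_size - 1) * (maximum - minimum) + minimum

action = np.array([0.9, 0.8, -0.3])
tokens = tokenize(action)        # [242 229  89]
recovered = detokenize(tokens)   # matches `action` up to ~1/255 quantization
print(tokens, recovered)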
+"""Tests for action_tokenizer.""" +import numpy as np +from robotics_transformer.tokenizers import action_tokenizer +from tensor2robot.utils import tensorspec_utils +import tensorflow as tf +from tf_agents.specs import tensor_spec + + +class ActionTokenizerTest(tf.test.TestCase): + + def testTokenize_int32(self): + action_spec = tensorspec_utils.TensorSpecStruct() + action_spec.terminate_episode = tensor_spec.BoundedTensorSpec( + (2,), dtype=tf.int32, minimum=0, maximum=1, name='terminate_episode') + tokenizer = action_tokenizer.RT1ActionTokenizer(action_spec, vocab_size=10) + self.assertEqual(1, tokenizer.tokens_per_action) + action = tensorspec_utils.TensorSpecStruct(terminate_episode=[0, 1]) + action_tokens = tokenizer.tokenize(action) + self.assertEqual([1], action_tokens.numpy()) + + def testTokenize_int32_not_one_hot(self): + action_spec = tensorspec_utils.TensorSpecStruct() + action_spec.terminate_episode = tensor_spec.BoundedTensorSpec( + (2,), dtype=tf.int32, minimum=0, maximum=1, name='terminate_episode') + tokenizer = action_tokenizer.RT1ActionTokenizer(action_spec, vocab_size=10) + self.assertEqual(1, tokenizer.tokens_per_action) + action = tensorspec_utils.TensorSpecStruct(terminate_episode=[1, 8]) + with self.assertRaises(tf.errors.InvalidArgumentError): + tokenizer.tokenize(action) + + def testDetokenize_int32(self): + action_spec = tensorspec_utils.TensorSpecStruct() + action_spec.terminate_episode = tensor_spec.BoundedTensorSpec( + (2,), dtype=tf.int32, minimum=0, maximum=1, name='terminate_episode') + tokenizer = action_tokenizer.RT1ActionTokenizer(action_spec, vocab_size=10) + # 0 token should become a one hot: [1, 0] + action = tokenizer.detokenize(tf.constant([0], dtype=tf.int32)) + self.assertSequenceEqual([1, 0], list(action['terminate_episode'].numpy())) + # 1 token should become a one hot: [0, 1] + action = tokenizer.detokenize(tf.constant([1], dtype=tf.int32)) + self.assertSequenceEqual([0, 1], list(action['terminate_episode'].numpy())) + # OOV 3 token should become a default one hot: [1, 0] + action = tokenizer.detokenize(tf.constant([3], dtype=tf.int32)) + self.assertSequenceEqual([1, 0], list(action['terminate_episode'].numpy())) + + def testTokenize_float(self): + action_spec = tensorspec_utils.TensorSpecStruct() + action_spec.world_vector = tensor_spec.BoundedTensorSpec( + (3,), dtype=tf.float32, minimum=-1., maximum=1., name='world_vector') + tokenizer = action_tokenizer.RT1ActionTokenizer(action_spec, vocab_size=10) + self.assertEqual(3, tokenizer.tokens_per_action) + action = tensorspec_utils.TensorSpecStruct(world_vector=[0.1, 0.5, -0.8]) + action_tokens = tokenizer.tokenize(action) + self.assertSequenceEqual([4, 6, 0], list(action_tokens.numpy())) + + def testTokenize_float_with_time_dimension(self): + action_spec = tensorspec_utils.TensorSpecStruct() + action_spec.world_vector = tensor_spec.BoundedTensorSpec( + (3,), dtype=tf.float32, minimum=-1., maximum=1., name='world_vector') + tokenizer = action_tokenizer.RT1ActionTokenizer(action_spec, vocab_size=10) + self.assertEqual(3, tokenizer.tokens_per_action) + batch_size = 2 + time_dimension = 3 + action = tensorspec_utils.TensorSpecStruct( + world_vector=tf.constant( + [[0.1, 0.5, -0.8], [0.1, 0.5, -0.8], [0.1, 0.5, -0.8], + [0.1, 0.5, -0.8], [0.1, 0.5, -0.8], [0.1, 0.5, -0.8]], + shape=[batch_size, time_dimension, tokenizer.tokens_per_action])) + action_tokens = tokenizer.tokenize(action) + self.assertSequenceEqual( + [batch_size, time_dimension, tokenizer.tokens_per_action], + 
action_tokens.shape.as_list()) + + def testTokenize_float_at_limits(self): + minimum = -1. + maximum = 1. + vocab_size = 10 + action_spec = tensorspec_utils.TensorSpecStruct() + action_spec.world_vector = tensor_spec.BoundedTensorSpec( + (2,), + dtype=tf.float32, + minimum=minimum, + maximum=maximum, + name='world_vector') + tokenizer = action_tokenizer.RT1ActionTokenizer( + action_spec, vocab_size=vocab_size) + self.assertEqual(2, tokenizer.tokens_per_action) + action = tensorspec_utils.TensorSpecStruct(world_vector=[minimum, maximum]) + action_tokens = tokenizer.tokenize(action) + # Minimum value will go to 0 + # Maximum value witll go to vocab_size-1 + self.assertSequenceEqual([0, vocab_size - 1], list(action_tokens.numpy())) + + def testTokenize_invalid_action_spec_shape(self): + action_spec = tensorspec_utils.TensorSpecStruct() + action_spec.world_vector = tensor_spec.BoundedTensorSpec( + (2, 2), dtype=tf.float32, minimum=1, maximum=-1, name='world_vector') + with self.assertRaises(ValueError): + action_tokenizer.RT1ActionTokenizer(action_spec, vocab_size=10) + + def testTokenizeAndDetokenizeIsEqual(self): + action_spec = tensorspec_utils.TensorSpecStruct() + + action_spec.world_vector = tensor_spec.BoundedTensorSpec( + (3,), dtype=tf.float32, minimum=-1., maximum=1., name='world_vector') + + action_spec.rotation_delta = tensor_spec.BoundedTensorSpec( + (3,), + dtype=tf.float32, + minimum=-np.pi / 2., + maximum=np.pi / 2., + name='rotation_delta') + + action_spec.gripper_closedness_action = tensor_spec.BoundedTensorSpec( + (1,), + dtype=tf.float32, + minimum=-1., + maximum=1., + name='gripper_closedness_action') + + num_sub_action_space = 2 + action_spec.terminate_episode = tensor_spec.BoundedTensorSpec( + (num_sub_action_space,), + dtype=tf.int32, + minimum=0, + maximum=1, + name='terminate_episode') + + tokenizer = action_tokenizer.RT1ActionTokenizer( + action_spec, + vocab_size=1024, + action_order=[ + 'terminate_episode', 'world_vector', 'rotation_delta', + 'gripper_closedness_action' + ]) + self.assertEqual(8, tokenizer.tokens_per_action) + + # Repeat the following test N times with fuzzy inputs. 
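+    # With vocab_size=1024 the bucket width is small enough that the
+    # round trip below is exact to within the places=2 tolerance.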
+ n_repeat = 10 + for _ in range(n_repeat): + action = tensorspec_utils.TensorSpecStruct( + world_vector=np.random.uniform(low=-1., high=1.0, size=3), + rotation_delta=np.random.uniform( + low=-np.pi / 2., high=np.pi / 2., size=3), + gripper_closedness_action=np.random.uniform(low=0., high=1.0, size=1), + terminate_episode=[0, 1]) + action_tokens = tokenizer.tokenize(action) + policy_action = tokenizer.detokenize(action_tokens) + + for k in action: + self.assertSequenceAlmostEqual( + action[k], policy_action[k].numpy(), places=2) + + # Repeat the test with batched actions + batched_action = tensorspec_utils.TensorSpecStruct( + world_vector=[ + np.random.uniform(low=-1., high=1.0, size=3), + np.random.uniform(low=-1., high=1.0, size=3) + ], + rotation_delta=[ + np.random.uniform(low=-np.pi / 2., high=np.pi / 2., size=3), + np.random.uniform(low=-np.pi / 2., high=np.pi / 2., size=3) + ], + gripper_closedness_action=[ + np.random.uniform(low=0., high=1.0, size=1), + np.random.uniform(low=0., high=1.0, size=1) + ], + terminate_episode=[[0, 1], [1, 0]]) + action_tokens = tokenizer.tokenize(batched_action) + policy_action = tokenizer.detokenize(action_tokens) + + for k in batched_action: + for a, policy_a in zip(batched_action[k], policy_action[k].numpy()): + self.assertSequenceAlmostEqual(a, policy_a, places=2) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tokenizers/image_tokenizer.py b/tokenizers/image_tokenizer.py new file mode 100644 index 0000000..ab6ccc7 --- /dev/null +++ b/tokenizers/image_tokenizer.py @@ -0,0 +1,112 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A FiLM Efficientnet contextual image tokenizer used in Robotics Transformer 1. +""" +from typing import Optional +from robotics_transformer.film_efficientnet import pretrained_efficientnet_encoder +from robotics_transformer.tokenizers import token_learner +import tensorflow as tf + + +class RT1ImageTokenizer(tf.keras.layers.Layer): + """Tokenizes based on vocab size.""" + + def __init__(self, + embedding_output_dim: int, + use_token_learner: bool = False, + num_tokens: int = 8, + **kwargs): + """Instantiates a RT1ImageTokenizer. + + Args: + embedding_output_dim: The output size of the tokens. + use_token_learner: Whether to use token learner. See + https://arxiv.org/abs/2106.11297 + num_tokens: Relevant only for token learner - the number of learned + tokens. + **kwargs: Keyword arguments to base class. 
+ """ + super().__init__(**kwargs) + self._embedding_output_dim = embedding_output_dim + + self._tokenizer = pretrained_efficientnet_encoder.EfficientNetEncoder( + pooling=False, early_film=True) + + self._use_token_learner = use_token_learner + if self._use_token_learner: + self._num_tokens = num_tokens + self._token_learner = token_learner.TokenLearnerModule( + num_tokens=self._num_tokens) + + @property + def tokens_per_context_image(self) -> int: + if self._use_token_learner: + num_tokens = self._num_tokens + else: + num_tokens = 81 + return num_tokens + + def __call__(self, + image: tf.Tensor, + context: Optional[tf.Tensor] = None, + training: bool = False) -> tf.Tensor: + """Gets image tokens. + + Args: + image: Images of shape (b, t, h, w, 3) to tokenize. + context: An optional context vector (e.g., a natural language embedding). + Expected to have shape (b, t, embedding_dim). + training: Whether or not we are in training mode. + + Returns: + tokens: has shape (batch, t, num_tokens_per_timestep, embedding_dim) + """ + image_shape = tf.shape(image) + b = image_shape[0] + t = image_shape[1] + h = image_shape[2] + w = image_shape[3] + c = image_shape[4] + + # Fold the time axis into the batch axis. + image = tf.reshape(image, [b * t, h, w, c]) + if context is not None: + context_rank = tf.rank(context) + assertion = tf.Assert(context_rank == 3, data=[context_rank]) + with tf.control_dependencies([assertion]): + context = tf.reshape(context, [b * t, tf.shape(context)[-1]]) + tokens = self.get_image_embeddings(image, context, training) + if self._use_token_learner: + tokens = self._token_learner(tokens, training) + # Unflatten the time axis, which was previously flattened into the batch. + tokens = tf.reshape(tokens, [b, t, tf.shape(tokens)[1], -1]) + return tokens + + def get_image_embeddings(self, + image: tf.Tensor, + context: Optional[tf.Tensor], + training: bool = False) -> tf.Tensor: + """Gets embeddings from image. + + Args: + image: Expected to be float32 in range [0, 1] with shape (b, h, w, 3). + context: Expected to be float32 with shape (b, embedding_dim) + training: Whether or not we are in training mode. + + Returns: + tokens of shape (b, num_tokens, emedding_dim) + """ + image_tokens = self._tokenizer(image, context=context, training=training) + image_tokens = tf.reshape(image_tokens, [-1, 81, 512]) + return image_tokens diff --git a/tokenizers/image_tokenizer_test.py b/tokenizers/image_tokenizer_test.py new file mode 100644 index 0000000..79a404f --- /dev/null +++ b/tokenizers/image_tokenizer_test.py @@ -0,0 +1,46 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
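A minimal sketch of the time-axis folding that RT1ImageTokenizer.__call__ above wraps around the per-image encoder; the encoder output is replaced by a random stand-in with the 81x512 token shape used when the token learner is disabled:

import tensorflow as tf

b, t, h, w = 2, 3, 300, 300
images = tf.random.uniform((b, t, h, w, 3))

# Fold the time axis into the batch axis before the per-image encoder ...
flat_images = tf.reshape(images, [b * t, h, w, 3])
# ... stand-in for get_image_embeddings(): 81 tokens of width 512 per image ...
flat_tokens = tf.random.uniform((b * t, 81, 512))
# ... and unfold the time axis again afterwards.
tokens = tf.reshape(flat_tokens, [b, t, 81, -1])
print(tokens.shape)  # (2, 3, 81, 512)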
+"""Tests for image_tokenizer.""" +from absl.testing import parameterized +from robotics_transformer.tokenizers import image_tokenizer +import tensorflow as tf + + +class ImageTokenizerTest(tf.test.TestCase, parameterized.TestCase): + + @parameterized.named_parameters( + ('sample_image', 512, 224, False, 8), + ('sample_image_token_learner', 512, 224, True, 8)) + def testTokenize(self, output_dim, image_resolution, use_token_learner, + num_tokens): + batch = 1 + seq = 2 + tokenizer = image_tokenizer.RT1ImageTokenizer( + embedding_output_dim=output_dim, + use_token_learner=use_token_learner, + num_tokens=num_tokens) + + image = tf.random.normal( + shape=(batch, seq, image_resolution, image_resolution, 3)) + image = tf.clip_by_value(image, 0.0, 1.0) + context_vector = tf.random.uniform((batch, seq, 512)) + image_tokens = tokenizer(image, context_vector) + if use_token_learner: + self.assertEqual(image_tokens.shape, [batch, seq, num_tokens, 512]) + else: + self.assertEqual(image_tokens.shape, [batch, seq, 81, 512]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tokenizers/token_learner.py b/tokenizers/token_learner.py new file mode 100644 index 0000000..ed1522a --- /dev/null +++ b/tokenizers/token_learner.py @@ -0,0 +1,128 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF implementation of Token Learner(Ryoo et al 2021).""" + +import functools +from typing import Optional, Sequence, Union +import numpy as np +import tensorflow as tf + + +def gelu(x: float) -> float: + return 0.5 * x * (1 + + tf.tanh(tf.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))) + + +def _maybe_dropout(rate: float = 0.0, name: str = "dropout"): + """Helper function to return dropout layer if rate is non zero.""" + if rate: + return tf.keras.layers.Dropout(rate, name=name) + return lambda x, *args: x # Does nothing to x. + + +class MlpBlock(tf.keras.layers.Layer): + """Transformer MLP / feed-forward block.""" + + def __init__(self, + *, + mlp_dim: int, + out_dim: Optional[int] = None, + kernel_init: Optional[tf.keras.initializers.Initializer] = tf + .keras.initializers.glorot_uniform(), + bias_init: Optional[tf.keras.initializers.Initializer] = tf.keras + .initializers.RandomNormal(stddev=1e-6), + dropout_rate: float = 0.1, + **kwargs): + """Initializer for the MLP Block. + + This computes outer_dense(gelu(hidden_dense(input))), with dropout + applied as necessary. + + Note: Especially outside a keras workflow, make sure to call layer.build + + Args: + mlp_dim: The dimension of the inner representation (output of hidden + layer). Usually larger than the input/output dim. + out_dim: The output dimension of the block. If None, the model output dim + is equal to the input dim (usually desired) + kernel_init: Initializer for dense kernels, used for both dense layers. + bias_init: Initializer for dense biases, used for both dense layers. 
+ dropout_rate: Dropout rate to be applied after dense ( & activation) + **kwargs: Other keyword args passed to the tf.keras.layers.Layer + constructor e.g. the name + """ + super().__init__(**kwargs) + self._out_dim = out_dim + self._hidden_dropout = _maybe_dropout(dropout_rate) + self._output_dropout = _maybe_dropout(dropout_rate) + self._hidden_layer = tf.keras.layers.Dense( + mlp_dim, + activation=gelu, + kernel_initializer=kernel_init, + bias_initializer=bias_init, + name="hidden_dense") + + # If out_dim is None, infer out_dim = input_dim at self.build() + self._output_layer = functools.partial( + tf.keras.layers.Dense, + kernel_initializer=kernel_init, + bias_initializer=bias_init, + name="final_dense") + + def build(self, input_shape: Sequence[int]): + out_dim = self._out_dim or input_shape[-1] + self._output_layer = self._output_layer(units=out_dim) + super().build(input_shape) + + def call(self, + inputs: tf.Tensor, + *, + is_training: Union[bool, tf.Tensor] = False) -> tf.Tensor: + """Applies Transformer MlpBlock module.""" + x = self._hidden_layer(inputs) + x = self._hidden_dropout(x, is_training) + x = self._output_layer(x) + x = self._output_dropout(x, is_training) + return x + + +class TokenLearnerModule(tf.keras.layers.Layer): + """TokenLearner module V1.1 (https://arxiv.org/abs/2106.11297).""" + + def __init__(self, + num_tokens: int, + bottleneck_dim: int = 64, + dropout_rate: float = 0.): + super().__init__() + + self.mlp = MlpBlock( + mlp_dim=bottleneck_dim, out_dim=num_tokens, dropout_rate=dropout_rate) + self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6) + + def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor: + if len(inputs.shape) == 4: + bs, h, w, c = inputs.shape + inputs = tf.reshape(inputs, [bs, h * w, c]) + + selected = self.layernorm(inputs) + + selected = self.mlp( + selected, is_training=training) # Shape: [bs, h*w, n_token]. + + selected = tf.transpose(selected, [0, 2, 1]) # Shape: [bs, n_token, h*w]. + selected = tf.nn.softmax(selected, axis=-1) + + feat = tf.einsum("...si,...id->...sd", selected, inputs) + + return feat # Shape: [bs, n_token, c] diff --git a/tokenizers/token_learner_test.py b/tokenizers/token_learner_test.py new file mode 100644 index 0000000..d16e49a --- /dev/null +++ b/tokenizers/token_learner_test.py @@ -0,0 +1,37 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
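The heart of TokenLearnerModule above is the softmax-weighted pooling at the end of call(). A minimal sketch with a random stand-in for the MLP's selection logits:

import tensorflow as tf

bs, hw, c, num_tokens = 2, 81, 512, 8
inputs = tf.random.normal((bs, hw, c))

# Stand-in for self.mlp(self.layernorm(inputs)): one selection logit per
# learned token at every spatial position.
selection_logits = tf.random.normal((bs, hw, num_tokens))

# Softmax over the spatial positions gives a per-token attention map ...
selected = tf.nn.softmax(tf.transpose(selection_logits, [0, 2, 1]), axis=-1)
# ... which pools the h*w input features down to num_tokens learned tokens.
feat = tf.einsum("...si,...id->...sd", selected, inputs)
print(feat.shape)  # (2, 8, 512)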
+"""Tests for token_learner.""" +from absl.testing import parameterized +from robotics_transformer.tokenizers import token_learner +import tensorflow as tf + + +class TokenLearnerTest(parameterized.TestCase): + + @parameterized.named_parameters(('sample_input', 512, 8)) + def testTokenLearner(self, embedding_dim, num_tokens): + batch = 1 + seq = 2 + token_learner_layer = token_learner.TokenLearnerModule( + num_tokens=num_tokens) + + inputvec = tf.random.normal(shape=(batch * seq, 81, embedding_dim)) + + learnedtokens = token_learner_layer(inputvec) + self.assertEqual(learnedtokens.shape, + [batch * seq, num_tokens, embedding_dim]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/transformer.py b/transformer.py new file mode 100644 index 0000000..23f117d --- /dev/null +++ b/transformer.py @@ -0,0 +1,169 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""RT1 decoder transformer. + +Copied from: +https://www.tensorflow.org/text/tutorials/transformer#decoder +""" +from typing import Tuple, Union + +import tensorflow as tf + + +class _TransformerLayer(tf.keras.layers.Layer): + """A single transformer block.""" + + def __init__(self, + layer_size: int = 4096, + num_heads: int = 8, + feed_forward_size: int = 512, + dropout_rate: float = 0.1, + return_attention_scores: bool = False): + """Creates a Transformer layer. + + Args: + layer_size: Size of the multiple head attention layer. + num_heads: Number of heads for the multiple head attention layer. + feed_forward_size: Dimensionality of the feed_forward layer. + dropout_rate: Dropout rate. + return_attention_scores: Return attention scores. + """ + super(_TransformerLayer, self).__init__() + + self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6) + self.mha1 = tf.keras.layers.MultiHeadAttention( + key_dim=layer_size, num_heads=num_heads, dropout=dropout_rate) + self.ff = tf.keras.layers.Dense(feed_forward_size) + self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6) + self.dropout_ff = tf.keras.layers.Dropout(dropout_rate) + self._return_attention_scores = return_attention_scores + + def call(self, x: tf.Tensor, attention_mask: tf.Tensor, + training: bool) -> Tuple[tf.Tensor, Union[tf.Tensor, None]]: + """Calls the layer. + + Args: + x: Input Tensor of shape `(B, T, dim)`. + attention_mask: a boolean mask of shape `(B, T, T)`, that prevents + attention to certain positions. The boolean mask specifies which query + elements can attend to which key elements, 1 indicates attention and 0 + indicates no attention. Broadcasting can happen for the missing batch + dimensions and the head dimension. + training: Python boolean indicating whether the layer should behave in + training mode (adding dropout) or in inference mode (no dropout). + + Returns: + y: Output Tensor of shape `(B, T, dim)`. Also return the attention scores + of shape `(B, T, dim)` or None. 
+ """ + x1 = self.layernorm1(x) + mha_results = self.mha1( + query=x1, + key=x1, + value=x1, + attention_mask=attention_mask, + return_attention_scores=self._return_attention_scores, + training=training) + if self._return_attention_scores: + x1, score = mha_results + else: + x1, score = mha_results, None + + x = x + x1 + + y = self.layernorm2(x) + ff_y = self.ff(y) + ff_y = self.dropout_ff(ff_y, training=training) + x = x + ff_y + return x, score + + +class Transformer(tf.keras.layers.Layer): + """A decoder only transformer.""" + + def __init__(self, + num_layers: int = 1, + layer_size: int = 4096, + num_heads: int = 8, + feed_forward_size: int = 512, + dropout_rate: float = 0.1, + vocab_size: int = 256, + return_attention_scores: bool = False): + """Creates a transformer. + + Args: + num_layers: Number of transformer layers. + layer_size: Size of the multiple head attention layer. + num_heads: Number of heads for the multiple head attention layer. + feed_forward_size: Dimensionality of the feed_forward layer. + dropout_rate: Dropout rate. + vocab_size: Dimensionality of tokens from the output layer. + return_attention_scores: Return attention scores. + """ + super(Transformer, self).__init__() + + self._layers = [ + _TransformerLayer( # pylint: disable=g-complex-comprehension + layer_size=layer_size, + num_heads=num_heads, + feed_forward_size=feed_forward_size, + dropout_rate=dropout_rate, + return_attention_scores=return_attention_scores) + for _ in range(num_layers) + ] + self._token_emb = tf.keras.layers.Dense(feed_forward_size) + self._position_emb = tf.keras.layers.Dense(feed_forward_size) + self._output_tokens = tf.keras.layers.Dense(vocab_size) + + def call( + self, + x: tf.Tensor, + training: bool, + attention_mask: tf.Tensor, + ) -> Union[tf.Tensor, Tuple[tf.Tensor, list[tf.Tensor]]]: + """Calls the layer. + + Args: + x: Input Tensor of shape `(B, T, dim)`. + training: Python boolean indicating whether the layer should behave in + training mode (adding dropout) or in inference mode (no dropout). + attention_mask: a boolean mask of shape `(B, T, T)`, that prevents + attention to certain positions. The boolean mask specifies which query + elements can attend to which key elements, 1 indicates attention and 0 + indicates no attention. Broadcasting can happen for the missing batch + dimensions and the head dimension. + + Returns: + x: Output Tensor of shape `(B, T, vocab_size)`. If + `return_attention_scores`, also return attention scores of + a list of `layer` of elements with shape `(B, T, dim)`. + """ + + seq_len = tf.shape(x)[1] + batch_size = tf.shape(x)[0] + + positions = tf.one_hot( + tf.tile(tf.expand_dims(tf.range(0, seq_len, 1), 0), [batch_size, 1]), + seq_len) + + x = self._token_emb(x) + x += self._position_emb(positions) + scores = [] + + for layer in self._layers: + x, score = layer(x, attention_mask=attention_mask, training=training) + if score is not None: + scores.append(score) + x = self._output_tokens(x) + return x, scores diff --git a/transformer_network.py b/transformer_network.py new file mode 100644 index 0000000..e31bab8 --- /dev/null +++ b/transformer_network.py @@ -0,0 +1,689 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tensorflow based methods for sequence agents.""" +from typing import Optional, Tuple, Union, Any + +from absl import logging +import numpy as np + +from robotics_transformer import transformer +from robotics_transformer.film_efficientnet import preprocessors +from robotics_transformer.tokenizers import action_tokenizer +from robotics_transformer.tokenizers import image_tokenizer + +from tensor2robot.utils import tensorspec_utils +import tensorflow as tf +from tf_agents.networks import network +from tf_agents.specs import tensor_spec +from tf_agents.utils import nest_utils + + +class TransformerNetwork(network.Network): + """A transformer based actor network.""" + + def __init__( + self, + input_tensor_spec: tensorspec_utils.TensorSpecStruct, + output_tensor_spec: tensorspec_utils.TensorSpecStruct, + train_step_counter: int = 0, + vocab_size: int = 256, + token_embedding_size: int = 512, + num_layers: int = 1, + layer_size: int = 4096, + num_heads: int = 8, + feed_forward_size: int = 512, + dropout_rate: float = 0.1, + time_sequence_length: int = 1, + crop_size: int = 236, + policy_info_spec: Optional[dict[Any, + tensor_spec.BoundedTensorSpec]] = None, + action_order: Optional[list[str]] = None, + use_token_learner: Optional[bool] = True, + return_attention_scores: bool = False, + **kwargs): + """Creates a transformer network. + + Args: + input_tensor_spec: Nested list/tuple/dict of TensorSpecs, describing the + shape of input tensor. + output_tensor_spec: Nested list/tuple/dict of TensorSpecs, describing the + shape of output tensor. + train_step_counter: Counter for number of steps. + vocab_size: Dimensionality of tokens from the output layer. + token_embedding_size: Dimensionality of tokens from the embedding layer. + num_layers: Number of transformer layers. + layer_size: Size of the multiple head attention layer. + num_heads: Number of heads for the multiple head attention layer. + feed_forward_size: Dimensionality of the feed_forward layer. + dropout_rate: Dropout rate. + time_sequence_length: Length of the time sequence. + crop_size: Height and width of the square crop, where original image will + be padded to allow full field of view to be extracted. + policy_info_spec: Spec on return value given return type of the return + tokenizer. + action_order: Order of actions for the action tokenizer. + use_token_learner: Whether to use token learner. See + https://arxiv.org/abs/2106.11297 + return_attention_scores: show attention scores in tensorboard. + **kwargs: Keyword parameter arguments. 
+ """ + self._input_tensor_spec = input_tensor_spec + self._output_tensor_spec = output_tensor_spec + self._train_step_counter = train_step_counter + self._actions = None + self._returns = None + self._vocab_size = vocab_size + self._token_embedding_size = token_embedding_size + self._time_sequence_length = time_sequence_length + self._crop_size = crop_size + + self._transformer = transformer.Transformer( + num_layers=num_layers, + layer_size=layer_size, + num_heads=num_heads, + feed_forward_size=feed_forward_size, + dropout_rate=dropout_rate, + vocab_size=self._vocab_size, + return_attention_scores=return_attention_scores) + + # create tokenizers + self._image_tokenizer = image_tokenizer.RT1ImageTokenizer( + embedding_output_dim=self._token_embedding_size, + use_token_learner=use_token_learner) + self._action_tokenizer = action_tokenizer.RT1ActionTokenizer( + output_tensor_spec, + vocab_size=self._vocab_size, + action_order=action_order) + + self._tokens_per_action = self._action_tokenizer.tokens_per_action + self._tokens_per_context_image = self._image_tokenizer.tokens_per_context_image + # generate loss and attention masks + self._generate_masks() + + # define mappings to token embedding size + self._action_token_emb = tf.keras.layers.Dense(self._token_embedding_size) + + # define loss function + self._loss_object = tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True, reduction=tf.keras.losses.Reduction.NONE) + self._attention_scores = [] + self._use_token_learner = use_token_learner + + super(TransformerNetwork, self).__init__( + input_tensor_spec=input_tensor_spec, **kwargs) + self._state_spec = { + # Force this to be 4 dimension due to b/254902773. + # Otherwise can be dimension 3. + 'context_image_tokens': + tensor_spec.TensorSpec( + shape=(time_sequence_length, self._tokens_per_context_image, 1, + token_embedding_size), + dtype=tf.float32, + name='context_image_tokens'), + 'action_tokens': + tensor_spec.TensorSpec( + shape=(time_sequence_length, self._tokens_per_action, 1, 1), + dtype=tf.int32, + name='action_tokens'), + # Stores where in the window we are. + # This value is within range [0, time_sequence_length + 1]. + # When seq_idx == time_sequence_length, context_image_tokens and + # action_tokens need to be shifted to the left. + 'seq_idx': + tensor_spec.TensorSpec( + shape=(1, 1, 1, 1), dtype=tf.int32, name='seq_idx') + } + + @property + def attention_scores(self) -> list[tf.Tensor]: + """Return attention score. This is for debugging/visualization purpose.""" + return self._attention_scores + + def _get_action_index_for_token(self, k): + """Returns action associated with the token at given position `k`. + + If k is not an action token then it returns -1. + If k is part of the first action in the sequence then returns 0 etc. + + Args: + k: an int that represents the position in the sequence. + + Returns: + The index of the action that this position belongs to, or if this + position is part of an image token then returns -1. 
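+
+      For illustration, with 81 image tokens and 8 action tokens per time
+      step (89 tokens per step), k = 80 is an image token (returns -1),
+      k = 81 is part of action 0, and k = 170 is part of action 1.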
+ """ + if (k < 0 or k >= self._all_num_tokens): + return -1 + + n = k + if n % self._single_time_step_num_tokens < self._tokens_per_context_image: + return -1 + return int(n / self._single_time_step_num_tokens) + + def _generate_masks(self): + """Generate mask for action prediction loss and attention visualization.""" + # each time step = [image, action] + self._single_time_step_num_tokens = ( + self._tokens_per_action + self._tokens_per_context_image) + + # full sequence = [prefix context + N x timestep + postfix context] + self._all_num_tokens = ( + self._time_sequence_length * self._single_time_step_num_tokens) + + # create mask for action predition loss + self._action_tokens_mask = [] + for n in range(0, self._all_num_tokens, self._single_time_step_num_tokens): + for x in range(0, self._tokens_per_action, 1): + self._action_tokens_mask.append(x + n + self._tokens_per_context_image) + self._action_tokens_mask = tf.constant( + self._action_tokens_mask, dtype=tf.int32) + + # The look ahead mask ensures causality. + self._default_attention_mask = tf.linalg.band_part( + tf.ones((self._all_num_tokens, self._all_num_tokens)), -1, 0) + + action_mask = np.ndarray( + shape=(self._all_num_tokens, self._all_num_tokens), dtype=int) + for i in range(self._all_num_tokens): + for j in range(self._all_num_tokens): + action_i = self._get_action_index_for_token(i) + action_j = self._get_action_index_for_token(j) + mask = 0 + if action_i != -1 and action_j != -1: + # Ignore actions of previous steps. + if action_j < action_i: + mask = 1 + # If we're not auto-regression, ignore action dimensions of current + # step. + if (action_j == action_i and j <= i): + mask = 1 + action_mask[i, j] = mask + self._default_attention_mask -= action_mask + + def _transformer_call( + self, + context_image_tokens: tf.Tensor, + action_tokens: tf.Tensor, + batch_size: int, + training: bool, + attention_mask: tf.Tensor, + ) -> Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]]: + """Calls the transformer. + + Args: + context_image_tokens: Tokenized context and image in Tensor of shape `(B, + T, num token, -1)`. + action_tokens: Discrete action token sequence of size [8, 256]. + batch_size: Batch size as when reshaping all tokens. + training: Whether to run the transformer in training mode. + attention_mask: Optional bool tensor for masking transformer's attention. + + Returns: + Output tokens in Tensor of shape `(B, T, dim)`. If + return_attention_scores, also return the attention scores of + shape `(B, T, dim)`. 
+ """ + input_token_sequence = self._assemble_input_token_sequence( + context_image_tokens, action_tokens, batch_size) + + # run transformer + output_tokens, self._attention_scores = self._transformer( + input_token_sequence, training, attention_mask) + return output_tokens + + def _get_tokens_and_mask(self, + observations: dict[str, tf.Tensor], + network_state: dict[str, tf.Tensor], + training: bool = False): + # tokenize all inputs + context_image_tokens, network_state = self._tokenize_images( + observations, network_state, training) + action_tokens = self._tokenize_actions(observations, network_state) + + # generate transformer attention mask + attention_mask = self._default_attention_mask + + return (context_image_tokens, action_tokens, attention_mask) + + def _transformer_call_and_slice(self, + *args, + slice_start: int = 0, + slice_length: int = 1, + **kwargs) -> Tuple[tf.Tensor, tf.Tensor]: + output_tokens = self._transformer_call(*args, **kwargs) + + slice_end = slice_start + slice_length + token_logits = output_tokens[:, slice_start:slice_end, :] + token = tf.argmax(token_logits, axis=-1, output_type=tf.int32) + + return token, token_logits + + def call(self, + observations: dict[str, tf.Tensor], + network_state: dict[str, tf.Tensor], + training: bool = False): + """Calls the transformer network. + + Args: + observations: Observation data including image and natural language + embedding in dict of Tensors. + network_state: Network state data including time step, image, action + tokens, step number in dict of Tensors. + training: Whether to call transformer network in training mode. + + Returns: + A tuple `(Detokenized output actions, network state)`. + """ + # used to determine training vs inference call + # outer_rank will be 2 -> [b, t] during training and + # outer_rank will be 1 -> [b] during inference + outer_rank = self._get_outer_rank(observations) + assert outer_rank in (1, 2) + + b, t = self._get_batch_size_and_seq_len(network_state) + + context_image_tokens, action_tokens, attention_mask = self._get_tokens_and_mask( + observations, network_state, training) + + self._aux_info = {'action_labels': action_tokens} + + if outer_rank == 1: # This is an inference call + # run transformer in loop to produce action tokens one-by-one + # TODO(b/231896343): Document/comment more on what the following mess is. + seq_idx = tf.reshape(network_state['seq_idx'], [1])[0] + action_t = tf.minimum(seq_idx, self._time_sequence_length - 1) + # Transformer shifts all to the left by one step by default (it's usually + # predicting the next token as default training task...). + transformer_shift = -1 + # We only want to get the action predicted at time_step. 
+ start_index = ( + transformer_shift + self._tokens_per_context_image + action_t * + (self._single_time_step_num_tokens)) + current_action_tokens = [] + action_predictions_logits = [] + for k in range(self._tokens_per_action): + action_index = start_index + k + token, token_logits = self._transformer_call_and_slice( + context_image_tokens, + action_tokens, + attention_mask=attention_mask, + batch_size=b, + training=training, + slice_start=action_index # slicing single action dimension + ) + action_predictions_logits.append(token_logits) + current_action_tokens.append(token) + # action_tokens is [b, t * self._tokens_per_action] + action_tokens = tf.reshape(action_tokens, [b, -1]) + action_start_index = (action_t * self._tokens_per_action) + k + action_tokens = tf.concat([ + action_tokens[:, :action_start_index], token, + action_tokens[:, action_start_index + 1:] + ], + axis=1) + # action_tokens is [b, t, self._tokens_per_action] + action_tokens = tf.reshape(action_tokens, + [b, t, self._tokens_per_action]) + self._aux_info.update({ + # action_predictions_logits is + # [b, self._tokens_per_action, self._vocab_size] + 'action_predictions_logits': tf.concat(action_predictions_logits, 1) + }) + # predicted_tokens_for_output is [b, self._tokens_per_action] + predicted_tokens_for_output = tf.concat(current_action_tokens, 1) + # state_action_tokens is [b, 1, self._tokens_per_action, 1, 1] + one_state_action_tokens = predicted_tokens_for_output[:, tf.newaxis, :, + tf.newaxis, + tf.newaxis] + + state_action_tokens = network_state['action_tokens'] + network_state['action_tokens'] = tf.concat([ + state_action_tokens[:, :action_t, ...], one_state_action_tokens, + state_action_tokens[:, action_t + 1:, ...] + ], + axis=1) + # Increment the time_step for the next inference call. + network_state['seq_idx'] = tf.reshape( + tf.minimum(seq_idx + 1, self._time_sequence_length), [-1, 1, 1, 1, 1]) + + self._loss = tf.constant(0.0) + else: + # training call --> simply run one transformer forward pass + output_tokens = self._transformer_call( + context_image_tokens, + action_tokens, + attention_mask=attention_mask, + batch_size=b, + training=training) + + # Gather all predicted actions for the action loss. + action_logits = tf.gather( + output_tokens, self._action_tokens_mask - 1, axis=1) + action_logits_for_training = tf.reshape( + action_logits, [b, t, self._tokens_per_action, -1]) + + # Only take the last action as the action. 
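+      # Note that the loss below is computed over the action logits of every
+      # timestep (the -1 in the gather above accounts for the next-token
+      # shift), while only the final timestep's prediction is detokenized and
+      # returned as the network output.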
+ # action_logits_for_output is [b, self._tokens_per_action, emb] + action_logits_for_output = action_logits_for_training[:, -1] + + # predicted_tokens_for_output is [b, self._tokens_per_action] + predicted_tokens_for_output = tf.argmax( + action_logits_for_output, axis=-1, output_type=tf.int32) + + num_items = ( + tf.cast(b * t, tf.float32) * self._single_time_step_num_tokens) + action_loss = tf.reduce_mean( + self._loss_object(action_tokens, action_logits_for_training) / + num_items, + axis=-1) + + self._loss = action_loss + + # store action labels and predictions for visualization + self._aux_info.update({ + 'action_predictions': + tf.argmax( + action_logits_for_training, axis=-1, output_type=tf.int32), + 'action_loss': + action_loss, + 'actor_loss_mask': + tf.ones([b], dtype=tf.float32) + }) + + output_actions = self._action_tokenizer.detokenize( + predicted_tokens_for_output) + return output_actions, network_state + + def add_summaries(self, observations: dict[str, tf.Tensor], + logging_info: dict[str, tf.Tensor], debug_summaries: bool, + training: bool) -> None: + """Adds summaries. + + Args: + observations: Observation data including image and natural language + instruction in dict of Tensors. + logging_info: Dict with all data stored for logging during training pass. + debug_summaries: Whether to include debug summaries. + training: Whether this function is called during training or inference. + """ + num_params = 0 + for weight in self.trainable_weights: + weight_params = 1 + for dim in weight.shape: + weight_params *= dim + num_params += weight_params + tf.compat.v2.summary.scalar(name='num_params', data=num_params) + # debug_summaries are for the non-tpu worker, train_summary. + if debug_summaries: + image = observations['image'] # [b, t, h, w, c] + image_h = image.shape[2] + image_w = image.shape[3] + batch_size = image.shape[0] + num_ts = image.shape[1] + logging.info('image shape %s', image.shape) + # Concat images for different timesteps across width. + image = tf.concat(tf.unstack(image, axis=1), 2) + # Concat images for different batches (up to 8) across height. + image = tf.expand_dims(tf.concat(tf.unstack(image, axis=0)[0:8], 0), 0) + tf.summary.image( + 'observations/image', + image, + step=self._train_step_counter, + # Single output since we have concatenated images along batch. 
+ max_outputs=1) + + # [b, t], strings + if 'natural_language_instruction' in observations: + task = observations['natural_language_instruction'][:, 0] + tf.summary.text( + 'natural_language_instruction', task, step=self._train_step_counter) + if self.attention_scores and not self._use_token_learner: + for l_idx, layer_attention_score in enumerate(self.attention_scores): + logging.info('Attention score shape: %s, %s', l_idx, + layer_attention_score.shape) + for head_idx in range(layer_attention_score.shape[1]): + pairwise_attention = tf.expand_dims( + layer_attention_score[:, head_idx], -1) + # pairwise attention shape (16, 552, 552, 1) + # make attention from different time steps comparable + pairwise_attention = pairwise_attention * np.arange( + 1, pairwise_attention.shape[1] + 1)[None, :, None, None] + + # visualize spatial attention, note this only supports + # mk1_500tasks_transformer pipeline with no token learner + img_tf_ts = tf.reshape( + tf.transpose( + tf.reshape( + tf.reduce_sum(pairwise_attention, axis=1) / np.arange( + pairwise_attention.shape[1], 0, -1)[None, :, None], + [batch_size, num_ts, -1]), + [0, 2, 1])[:, :-self._tokens_per_action, :], + [-1, 9, 9, num_ts]) + + img_tf_ts = tf.image.resize( + img_tf_ts, [image_h, image_w], + method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) + img_tf_ts_concat = tf.concat(tf.unstack(img_tf_ts, axis=3), 2) + img_tf_ts_concat_min = tf.reduce_min( + img_tf_ts_concat, axis=[1, 2], keepdims=True) + img_tf_ts_concat = (img_tf_ts_concat - img_tf_ts_concat_min) / ( + tf.reduce_max(img_tf_ts_concat, axis=[1, 2], keepdims=True) - + img_tf_ts_concat_min) + img_tf_ts_concat = tf.concat( + tf.unstack(img_tf_ts_concat, axis=0)[:8], 0) + img_tf_ts_concat = tf.expand_dims( + tf.expand_dims(img_tf_ts_concat, 0), -1) + tf.summary.image( + 'attention/layer_{}/head_{}'.format(l_idx, head_idx), + img_tf_ts_concat, + step=self._train_step_counter, + # Single output since we have concatenated images along batch. + max_outputs=1) + + if img_tf_ts_concat.shape[1] == image.shape[ + 1] and img_tf_ts_concat.shape[2] == image.shape[2]: + # can overlay + overlay_viz = tf.cast( + (tf.cast(image, tf.float32) * (0.2 + img_tf_ts_concat) / 1.2), + tf.uint8) + tf.summary.image( + 'overlay_attention/layer_{}/head_{}'.format(l_idx, head_idx), + overlay_viz, + step=self._train_step_counter, + # Single output since we have concatenated images along batch. 
+ max_outputs=1) + + # log action info + action_labels = tf.boolean_mask(logging_info['action_labels'], + logging_info['actor_loss_mask']) + action_predictions = tf.boolean_mask(logging_info['action_predictions'], + logging_info['actor_loss_mask']) + with tf.name_scope('ActionTokens'): + token_accuracy = ( + tf.cast(tf.equal(action_labels, action_predictions), tf.float32)) + accuracy = tf.reduce_mean(token_accuracy) + tf.compat.v2.summary.scalar( + name='accuracy', data=accuracy, step=self._train_step_counter) + # Accuracy across timesteps + for t in range(self._time_sequence_length): + tf.compat.v2.summary.scalar( + name='accuracy/time_step/{}'.format(t), + data=tf.reduce_mean(token_accuracy[:, t, :]), + step=self._train_step_counter) + token_index = 0 + for k in self._action_tokenizer.action_order: + spec = self._action_tokenizer.action_spec[k] + if spec.dtype == tf.int32: + n_tokens = 1 + else: + n_tokens = spec.shape[0] + action_token_accuracy = tf.reduce_mean( + token_accuracy[:, :, token_index:token_index + n_tokens]) + tf.compat.v2.summary.scalar( + name='accuracy/action_type/{}'.format(k), + data=action_token_accuracy, + step=self._train_step_counter) + for n in range(n_tokens): + tf.summary.histogram( + 'tokens/{}_{}/labels'.format(k, n + 1), + action_labels[:, :, token_index], + step=self._train_step_counter) + tf.summary.histogram( + 'tokens/{}_{}/predictions'.format(k, n + 1), + action_predictions[:, :, token_index], + step=self._train_step_counter) + token_index += 1 + + # log loss components + with tf.name_scope('TokenLosses'): + tf.compat.v2.summary.scalar( + name='action_loss', + data=tf.reduce_mean(logging_info['action_loss']), + step=self._train_step_counter) + + def _tokenize_images(self, observations, network_state, training): + image = observations['image'] # [b, t, h, w, c] + outer_rank = self._get_outer_rank(observations) + if outer_rank == 1: # This is an inference call + seq_idx = tf.reshape(network_state['seq_idx'], [1])[0] + time_step = tf.minimum(seq_idx, self._time_sequence_length - 1) + image = tf.expand_dims(image, 1) + + # TODO(b/255731285) + image_shape = tf.shape(image) + b = image_shape[0] + input_t = image_shape[1] + h = image_shape[2] + w = image_shape[3] + c = image_shape[4] + + context = self._extract_context_from_observation(observations, input_t) + + image = tf.reshape(image, [b * input_t, h, w, c]) + seed = tf.random.uniform(shape=(2,), maxval=2**30, dtype=tf.int32) + image = preprocessors.convert_dtype_and_crop_images( + image, + crop_size=self._crop_size, + training=training, + pad_then_crop=True, + convert_dtype=True, + seed=seed) + image = tf.reshape(image, [b, input_t, h, w, c]) + context_image_tokens = self._image_tokenizer( + image, context=context, training=training) + num_tokens = tf.shape(context_image_tokens)[2] + context_image_tokens = tf.reshape(context_image_tokens, + [b, input_t, num_tokens, 1, -1]) + if outer_rank == 1: # This is an inference call + network_state['context_image_tokens'] = tf.reshape( + network_state['context_image_tokens'], [ + b, self._time_sequence_length, self._tokens_per_context_image, 1, + -1 + ]) + state_image_tokens = network_state['context_image_tokens'] + # network_state as input for this call is the output from the last call. + # Therefore, we need to shift all images to the left by 1 in the time axis + # to align w/ the time dim in this call. 
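+      # The roll below only fires once the window is full
+      # (seq_idx == time_sequence_length); shifting left by one frees the last
+      # slot so the newly tokenized frame can be spliced in at `time_step`
+      # (clipped to time_sequence_length - 1) right after.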
+ state_image_tokens = tf.cond( + seq_idx == self._time_sequence_length, + lambda: tf.roll(state_image_tokens, -1, axis=1), + lambda: state_image_tokens) + + context_image_tokens = tf.concat([ + state_image_tokens[:, :time_step, ...], context_image_tokens, + state_image_tokens[:, time_step + 1:, ...] + ], + axis=1) + network_state['context_image_tokens'] = context_image_tokens + + return context_image_tokens, network_state + + def _tokenize_actions(self, observations, network_state): + outer_rank = self._get_outer_rank(observations) + if outer_rank == 1: # This is an inference call + # TODO(b/231896343): Clarify what is going on with the network state + # tensors, currently they all have to be the same n_dims so we have to + # add/remove dummy dims. + action_tokens = tf.squeeze(network_state['action_tokens'], [3, 4]) + seq_idx = tf.reshape(network_state['seq_idx'], [1])[0] + # network_state as input for this call is the output from the last call. + # Therefore, we need to shift all actions by 1 to the left. + action_tokens = tf.cond(seq_idx == self._time_sequence_length, + lambda: tf.roll(action_tokens, -1, axis=1), + lambda: action_tokens) + else: + assert outer_rank == 2 + if self._actions is None: + b, t = self._get_batch_size_and_seq_len(network_state) + action_tokens = tf.zeros( + shape=[b, t, self._tokens_per_action], dtype=tf.int32) + else: + action_tokens = self._action_tokenizer.tokenize(self._actions) + return action_tokens + + def _assemble_input_token_sequence(self, context_image_tokens, action_tokens, + batch_size): + # embed action tokens + action_tokens = tf.one_hot(action_tokens, self._vocab_size) + action_tokens = self._action_token_emb(action_tokens) + action_tokens = tf.zeros_like(action_tokens) # b/260260205 + + # Because of b/254902773, we need to add 1 extra dimension. + action_tokens = tf.expand_dims(action_tokens, axis=-2) + + # assemble token sequence + input_token_sequence = tf.concat([context_image_tokens, action_tokens], + axis=2) + + input_token_sequence = tf.reshape( + input_token_sequence, [batch_size, -1, self._token_embedding_size]) + return input_token_sequence + + def _extract_context_from_observation(self, observations, seq_len): + """Extract context from observation.""" + context = None + if 'natural_language_embedding' in observations: + outer_rank = self._get_outer_rank(observations) + context = observations['natural_language_embedding'] # [b, t, emb-size] + if outer_rank == 1: + context = tf.tile(context[:, None], [1, seq_len, 1]) + return context + + def set_actions(self, actions: tensorspec_utils.TensorSpecStruct): + """Sets actions that will be tokenized and used in transformer network. + + Args: + actions: actions to be tokenized and used in transformer network. 
example + actions are terminate = [0, 1] world_vector = [0.9, 0.8, -0.3] + rotation_delta = [-0.1, 0.2, .6] gripper_closedness = 0.9 + """ + self._actions = actions + + def _get_outer_rank(self, observations): + # used to determine training vs inference call + # outer_rank will be 2 -> [b, t] during training and + # outer_rank will be 1 -> [b] during inference + return nest_utils.get_outer_rank(observations, self._input_tensor_spec) + + def _get_batch_size_and_seq_len(self, network_state): + image_shape = tf.shape(network_state['context_image_tokens']) + b = image_shape[0] + t = image_shape[1] + return b, t + + def get_actor_loss(self) -> tf.Tensor: + return self._loss + + def get_aux_info(self) -> dict[str, Any]: + return self._aux_info diff --git a/transformer_network_test.py b/transformer_network_test.py new file mode 100644 index 0000000..e584ce0 --- /dev/null +++ b/transformer_network_test.py @@ -0,0 +1,229 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for networks.""" + +from absl.testing import parameterized + +from robotics_transformer import transformer_network +from robotics_transformer.transformer_network_test_set_up import BATCH_SIZE +from robotics_transformer.transformer_network_test_set_up import NAME_TO_INF_OBSERVATIONS +from robotics_transformer.transformer_network_test_set_up import NAME_TO_STATE_SPECS +from robotics_transformer.transformer_network_test_set_up import observations_list +from robotics_transformer.transformer_network_test_set_up import spec_names_list +from robotics_transformer.transformer_network_test_set_up import state_spec_list +from robotics_transformer.transformer_network_test_set_up import TIME_SEQUENCE_LENGTH +from robotics_transformer.transformer_network_test_set_up import TransformerNetworkTestUtils + +import tensorflow as tf +from tf_agents.specs import tensor_spec + + +class TransformerNetworkTest(TransformerNetworkTestUtils): + + # pylint:disable=g-complex-comprehension + @parameterized.named_parameters([{ + 'testcase_name': '_' + name, + 'state_spec': spec, + 'train_observation': obs, + } for (name, spec, + obs) in zip(spec_names_list(), state_spec_list(), observations_list())] + ) + # pylint:enable=g-complex-comprehension + def testTransformerTrainLossCall(self, state_spec, train_observation): + network = transformer_network.TransformerNetwork( + input_tensor_spec=state_spec, + output_tensor_spec=self._action_spec, + time_sequence_length=TIME_SEQUENCE_LENGTH) + + network.create_variables() + self.assertNotEmpty(network.variables) + + network.set_actions(self._train_action) + network_state = tensor_spec.sample_spec_nest( + network.state_spec, outer_dims=[BATCH_SIZE]) + output_actions, network_state = network( + train_observation, step_type=None, network_state=network_state) + expected_shape = [2, 3] + self.assertEqual(network.get_actor_loss().shape, + tf.TensorShape(expected_shape)) + self.assertCountEqual(self._train_action.keys(), output_actions.keys()) + + # pylint:disable=g-complex-comprehension + 
@parameterized.named_parameters([{ + 'testcase_name': '_' + name, + 'spec_name': name, + } for name in spec_names_list()]) + # pylint:enable=g-complex-comprehension + def testTransformerInferenceLossCall(self, spec_name): + state_spec = NAME_TO_STATE_SPECS[spec_name] + observation = NAME_TO_INF_OBSERVATIONS[spec_name] + + network = transformer_network.TransformerNetwork( + input_tensor_spec=state_spec, + output_tensor_spec=self._action_spec, + time_sequence_length=TIME_SEQUENCE_LENGTH, + action_order=[ + 'terminate_episode', 'world_vector', 'rotation_delta', + 'gripper_closedness_action' + ]) + network.create_variables() + self.assertNotEmpty(network.variables) + + network.set_actions(self._inference_action) + # inference currently only support batch size of 1 + network_state = tensor_spec.sample_spec_nest( + network.state_spec, outer_dims=[1]) + + output_actions, network_state = network( + observation, step_type=None, network_state=network_state) + + tf.debugging.assert_equal(network.get_actor_loss(), 0.0) + self.assertCountEqual(self._inference_action.keys(), output_actions.keys()) + + # pylint:disable=g-complex-comprehension + @parameterized.named_parameters([{ + 'testcase_name': '_' + name, + 'state_spec': spec, + 'train_observation': obs, + } for name, spec, obs in zip(spec_names_list(), state_spec_list(), + observations_list())]) + # pylint:enable=g-complex-comprehension + def testTransformerLogging(self, state_spec, train_observation): + network = transformer_network.TransformerNetwork( + input_tensor_spec=state_spec, + output_tensor_spec=self._action_spec, + time_sequence_length=TIME_SEQUENCE_LENGTH, + action_order=[ + 'terminate_episode', 'world_vector', 'rotation_delta', + 'gripper_closedness_action' + ]) + + network.create_variables() + self.assertNotEmpty(network.variables) + + network.set_actions(self._train_action) + network_state = tensor_spec.sample_spec_nest( + network.state_spec, outer_dims=[BATCH_SIZE]) + _ = network(train_observation, step_type=None, network_state=network_state) + network.add_summaries( + train_observation, + network.get_aux_info(), + debug_summaries=True, + training=True) + + # pylint:disable=g-complex-comprehension + @parameterized.named_parameters([{ + 'testcase_name': '_' + name, + 'state_spec': spec, + } for name, spec in zip(spec_names_list(), state_spec_list())]) + # pylint:enable=g-complex-comprehension + def testTransformerCausality(self, state_spec): + """Tests the causality for the transformer. 
+ + Args: + state_spec: Which state spec to test the transformer with + """ + network = transformer_network.TransformerNetwork( + input_tensor_spec=state_spec, + output_tensor_spec=self._action_spec, + time_sequence_length=TIME_SEQUENCE_LENGTH) + network.create_variables() + self.assertNotEmpty(network.variables) + + time_sequence_length = network._time_sequence_length + tokens_per_image = network._tokens_per_context_image + tokens_per_action = network._tokens_per_action + + def _split_image_and_action_tokens(all_tokens): + image_start_indices = [(tokens_per_image + tokens_per_action) * k + for k in range(time_sequence_length)] + image_tokens = tf.stack( + [all_tokens[i:i + tokens_per_image] for i in image_start_indices], + axis=0) + action_start_indices = [i + tokens_per_image for i in image_start_indices] + action_tokens = [ + tf.stack([ + all_tokens[i:i + tokens_per_action] for i in action_start_indices + ], 0) + ] + image_tokens = tf.one_hot(image_tokens, network._token_embedding_size) + # Remove extra dimension before the end once b/254902773 is fixed. + shape = image_tokens.shape + # Add batch dimension. + image_tokens = tf.reshape(image_tokens, + [1] + shape[:-1] + [1] + shape[-1:]) + return image_tokens, action_tokens + + # Generate some random tokens for image and actions. + all_tokens = tf.random.uniform( + shape=[time_sequence_length * (tokens_per_image + tokens_per_action)], + dtype=tf.int32, + maxval=10, + minval=0) + context_image_tokens, action_tokens = _split_image_and_action_tokens( + all_tokens) + # Get the output tokens without any zeroed out input tokens. + output_tokens = network._transformer_call( + context_image_tokens=context_image_tokens, + action_tokens=action_tokens, + attention_mask=network._default_attention_mask, + batch_size=1, + training=False)[0] + + for t in range(time_sequence_length * + (tokens_per_image + tokens_per_action)): + # Zero out future input tokens. + all_tokens_at_t = tf.concat( + [all_tokens[:t + 1], + tf.zeros_like(all_tokens[t + 1:])], 0) + context_image_tokens, action_tokens = _split_image_and_action_tokens( + all_tokens_at_t) + # Get the output tokens with zeroed out input tokens after t. + output_tokens_at_t = network._transformer_call( + context_image_tokens=context_image_tokens, + action_tokens=action_tokens, + attention_mask=network._default_attention_mask, + batch_size=1, + training=False)[0] + # The output token is unchanged if future input tokens are zeroed out. + self.assertAllEqual(output_tokens[:t + 1], output_tokens_at_t[:t + 1]) + + def testLossMasks(self): + self._define_specs() + self._create_agent() + image_tokens = 3 + action_tokens = 2 + self._agent._actor_network._time_sequence_length = 2 + self._agent._actor_network._tokens_per_context_image = image_tokens + self._agent._actor_network._tokens_per_action = action_tokens + self._agent._actor_network._generate_masks() + self.assertAllEqual( + self._agent._actor_network._action_tokens_mask, + tf.constant([ + image_tokens, image_tokens + 1, 2 * image_tokens + action_tokens, + 2 * image_tokens + action_tokens + 1 + ], tf.int32)) + self._agent._actor_network._generate_masks() + self.assertAllEqual( + self._agent._actor_network._action_tokens_mask, + tf.constant([ + image_tokens, image_tokens + 1, 2 * (image_tokens) + action_tokens, + 2 * (image_tokens) + action_tokens + 1 + ], tf.int32)) + + +if __name__ == '__main__': + # Useful to enable if running with ipdb. 
+ tf.config.run_functions_eagerly(True) + tf.test.main() diff --git a/transformer_network_test_set_up.py b/transformer_network_test_set_up.py new file mode 100644 index 0000000..f51d08f --- /dev/null +++ b/transformer_network_test_set_up.py @@ -0,0 +1,391 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for networks.""" + +import copy +from typing import Optional, Tuple, Union + +from absl.testing import parameterized +import numpy as np +from robotics_transformer import sequence_agent +from robotics_transformer import transformer_network +from tensor2robot.utils import tensorspec_utils +import tensorflow as tf +from tf_agents.specs import tensor_spec +from tf_agents.trajectories import time_step as ts + +BATCH_SIZE = 2 +TIME_SEQUENCE_LENGTH = 3 +HEIGHT = 256 +WIDTH = 320 +NUM_IMAGE_TOKENS = 2 + + +def spec_names_list() -> list[str]: + """Lists the different types of specs accepted by the transformer.""" + return ['default'] + + +def state_spec_list() -> list[tensorspec_utils.TensorSpecStruct]: + """Lists the different types of state spec accepted by the transformer.""" + state_spec = tensorspec_utils.TensorSpecStruct() + state_spec.image = tensor_spec.BoundedTensorSpec([HEIGHT, WIDTH, 3], + dtype=tf.float32, + name='image', + minimum=0., + maximum=1.) + state_spec.natural_language_embedding = tensor_spec.TensorSpec( + shape=[512], dtype=tf.float32, name='natural_language_embedding') + + state_spec_mask = copy.deepcopy(state_spec) + state_spec_mask.initial_binary_mask = tensor_spec.BoundedTensorSpec( + [HEIGHT, WIDTH, 1], + dtype=tf.int32, + name='initial_binary_mask', + minimum=0, + maximum=255) + + state_spec_tcl = copy.deepcopy(state_spec) + state_spec_tcl.original_image = tensor_spec.BoundedTensorSpec( + [HEIGHT, WIDTH, 3], + dtype=tf.float32, + name='original_image', + minimum=0., + maximum=1.) + + return [ + state_spec, + state_spec_mask, + state_spec_tcl, + ] + + +def observations_list(training: bool = True) -> list[dict[str, tf.Tensor]]: + """Lists the different types of observations accepted by the transformer.""" + if training: + image_shape = [BATCH_SIZE, TIME_SEQUENCE_LENGTH, HEIGHT, WIDTH, 3] + emb_shape = [BATCH_SIZE, TIME_SEQUENCE_LENGTH, 512] + mask_shape = [BATCH_SIZE, TIME_SEQUENCE_LENGTH, HEIGHT, WIDTH, 1] + else: + # inference currently only support batch size of 1 + image_shape = [1, HEIGHT, WIDTH, 3] + emb_shape = [1, 512] + mask_shape = [1, HEIGHT, WIDTH, 1] + return [ + { + 'image': tf.constant(0.5, shape=image_shape), + 'natural_language_embedding': tf.constant(1., shape=emb_shape), + }, + { + 'image': tf.constant(0.5, shape=image_shape), + 'natural_language_embedding': tf.constant(1., shape=emb_shape), + 'initial_binary_mask': tf.constant(192, shape=mask_shape), + }, + { # This is used for TCL. 
+ 'image': tf.constant(0.5, shape=image_shape), + 'original_image': tf.constant(0.4, shape=image_shape), + 'natural_language_embedding': tf.constant(1., shape=emb_shape), + }, + ] + + +NAME_TO_STATE_SPECS = dict(zip(spec_names_list(), state_spec_list())) +NAME_TO_OBSERVATIONS = dict(zip(spec_names_list(), observations_list())) +NAME_TO_INF_OBSERVATIONS = dict( + zip(spec_names_list(), observations_list(False))) + + +class FakeImageTokenizer(tf.keras.layers.Layer): + """Fake Image Tokenizer for testing Transformer.""" + + def __init__(self, + encoder: ..., + position_embedding: ..., + embedding_output_dim: int, + patch_size: int, + use_token_learner: bool = False, + num_tokens: int = NUM_IMAGE_TOKENS, + use_initial_binary_mask: bool = False, + **kwargs): + del encoder, position_embedding, patch_size, use_token_learner + super().__init__(**kwargs) + self.tokens_per_context_image = num_tokens + if use_initial_binary_mask: + self.tokens_per_context_image += 1 + self.embedding_output_dim = embedding_output_dim + self.use_initial_binary_mask = use_initial_binary_mask + + def __call__(self, + image: tf.Tensor, + context: Optional[tf.Tensor] = None, + initial_binary_mask: Optional[tf.Tensor] = None, + training: bool = False) -> tf.Tensor: + if self.use_initial_binary_mask: + assert initial_binary_mask is not None + image_shape = tf.shape(image) + seq_size = image_shape[1] + batch_size = image_shape[0] + all_tokens = [] + num_tokens = self.tokens_per_context_image + for t in range(seq_size): + tokens = tf.ones([batch_size, 1, num_tokens, self.embedding_output_dim + ]) * image[0][t][0][0] + all_tokens.append(tokens) + return tf.concat(all_tokens, axis=1) + + +class TransformerNetworkTestUtils(tf.test.TestCase, parameterized.TestCase): + """Defines specs, SequenceAgent, and various other testing utilities.""" + + def _define_specs(self, + train_batch_size=BATCH_SIZE, + inference_batch_size=1, + time_sequence_length=TIME_SEQUENCE_LENGTH, + inference_sequence_length=TIME_SEQUENCE_LENGTH, + token_embedding_size=512, + image_width=WIDTH, + image_height=HEIGHT): + """Defines specs and observations (both training and inference).""" + self.train_batch_size = train_batch_size + self.inference_batch_size = inference_batch_size + self.time_sequence_length = time_sequence_length + self.inference_sequence_length = inference_sequence_length + self.token_embedding_size = token_embedding_size + action_spec = tensorspec_utils.TensorSpecStruct() + action_spec.world_vector = tensor_spec.BoundedTensorSpec( + (3,), dtype=tf.float32, minimum=-1., maximum=1., name='world_vector') + + action_spec.rotation_delta = tensor_spec.BoundedTensorSpec( + (3,), + dtype=tf.float32, + minimum=-np.pi / 2, + maximum=np.pi / 2, + name='rotation_delta') + + action_spec.gripper_closedness_action = tensor_spec.BoundedTensorSpec( + (1,), + dtype=tf.float32, + minimum=-1., + maximum=1., + name='gripper_closedness_action') + action_spec.terminate_episode = tensor_spec.BoundedTensorSpec( + (2,), dtype=tf.int32, minimum=0, maximum=1, name='terminate_episode') + + state_spec = tensorspec_utils.TensorSpecStruct() + state_spec.image = tensor_spec.BoundedTensorSpec( + [image_height, image_width, 3], + dtype=tf.float32, + name='image', + minimum=0., + maximum=1.) 
+ state_spec.natural_language_embedding = tensor_spec.TensorSpec( + shape=[self.token_embedding_size], + dtype=tf.float32, + name='natural_language_embedding') + self._policy_info_spec = { + 'return': + tensor_spec.BoundedTensorSpec((), + dtype=tf.float32, + minimum=0.0, + maximum=1.0, + name='return'), + 'discounted_return': + tensor_spec.BoundedTensorSpec((), + dtype=tf.float32, + minimum=0.0, + maximum=1.0, + name='discounted_return'), + } + + self._state_spec = state_spec + self._action_spec = action_spec + + self._inference_observation = { + 'image': + tf.constant( + 1, + shape=[self.inference_batch_size, image_height, image_width, 3], + dtype=tf.dtypes.float32), + 'natural_language_embedding': + tf.constant( + 1., + shape=[self.inference_batch_size, self.token_embedding_size], + dtype=tf.dtypes.float32), + } + self._train_observation = { + 'image': + tf.constant( + 0.5, + shape=[ + self.train_batch_size, self.time_sequence_length, + image_height, image_width, 3 + ]), + 'natural_language_embedding': + tf.constant( + 1., + shape=[ + self.train_batch_size, self.time_sequence_length, + self.token_embedding_size + ]), + } + self._inference_action = { + 'world_vector': + tf.constant(0.5, shape=[self.inference_batch_size, 3]), + 'rotation_delta': + tf.constant(0.5, shape=[self.inference_batch_size, 3]), + 'terminate_episode': + tf.constant( + [0, 1] * self.inference_batch_size, + shape=[self.inference_batch_size, 2]), + 'gripper_closedness_action': + tf.constant(0.5, shape=[self.inference_batch_size, 1]), + } + self._train_action = { + 'world_vector': + tf.constant( + 0.5, + shape=[self.train_batch_size, self.time_sequence_length, 3]), + 'rotation_delta': + tf.constant( + 0.5, + shape=[self.train_batch_size, self.time_sequence_length, 3]), + 'terminate_episode': + tf.constant( + [0, 1] * self.train_batch_size * self.time_sequence_length, + shape=[self.train_batch_size, self.time_sequence_length, 2]), + 'gripper_closedness_action': + tf.constant( + 0.5, + shape=[self.train_batch_size, self.time_sequence_length, 1]), + } + + def _create_agent(self, actor_network=None): + """Creates SequenceAgent using custom actor_network.""" + time_step_spec = ts.time_step_spec(observation_spec=self._state_spec) + if actor_network is None: + actor_network = transformer_network.TransformerNetwork + + self._agent = sequence_agent.SequenceAgent( + time_step_spec=time_step_spec, + action_spec=self._action_spec, + actor_network=actor_network, + actor_optimizer=tf.keras.optimizers.Adam(), + train_step_counter=tf.compat.v1.train.get_or_create_global_step(), + time_sequence_length=TIME_SEQUENCE_LENGTH) + self._num_action_tokens = ( + # pylint:disable=protected-access + self._agent._actor_network._action_tokenizer._tokens_per_action) + # pylint:enable=protected-access + + def setUp(self): + self._define_specs() + super().setUp() + + def get_image_value(self, step_idx: int) -> float: + return float(step_idx) / self.time_sequence_length + + def get_action_logits(self, batch_size: int, value: int, + vocab_size: int) -> tf.Tensor: + return tf.broadcast_to( + tf.one_hot(value % vocab_size, vocab_size)[tf.newaxis, tf.newaxis, :], + [batch_size, 1, vocab_size]) + + def create_obs(self, value) -> dict[str, tf.Tensor]: + observations = {} + observations['image'] = value * self._inference_observation['image'] + observations[ + 'natural_language_embedding'] = value * self._inference_observation[ + 'natural_language_embedding'] + return observations + + def fake_action_token_emb(self, action_tokens) -> tf.Tensor: + """Just pad 
with zeros.""" + shape = action_tokens.shape + assert self.vocab_size > self.token_embedding_size + assert len(shape) == 4 + return action_tokens[:, :, :, :self.token_embedding_size] + + def fake_transformer( + self, all_tokens, training, + attention_mask) -> Union[tf.Tensor, Tuple[tf.Tensor, list[tf.Tensor]]]: + """Fakes the call to TransformerNetwork._transformer.""" + del training + del attention_mask + # We expect ST00 ST01 A00 A01... + # Where: + # * ST01 is token 1 of state 0. + # * A01 is token 1 of action 0. + shape = all_tokens.shape.as_list() + batch_size = shape[0] + self.assertEqual(batch_size, 1) + emb_size = self.token_embedding_size + + # transform to [batch_size, num_tokens, token_size] + all_tokens = tf.reshape(all_tokens, [batch_size, -1, emb_size]) + # Pads tokens to be of vocab_size. + self.assertGreater(self.vocab_size, self.token_embedding_size) + all_shape = all_tokens.shape + self.assertLen(all_shape.as_list(), 3) + output_tokens = tf.concat([ + all_tokens, + tf.zeros([ + all_shape[0], all_shape[1], + self.vocab_size - self.token_embedding_size + ]) + ], + axis=-1) + num_tokens_per_step = NUM_IMAGE_TOKENS + self._num_action_tokens + # Check state/action alignment. + window_range = min(self._step_idx + 1, self.time_sequence_length) + for j in range(window_range): + # The index step that is stored in j = 0. + first_step_idx = max(0, self._step_idx + 1 - self.time_sequence_length) + image_idx = j * num_tokens_per_step + action_start_index = image_idx + NUM_IMAGE_TOKENS + for t in range(NUM_IMAGE_TOKENS): + self.assertAllEqual( + self.get_image_value(first_step_idx + j) * + tf.ones_like(all_tokens[0][image_idx][:self.token_embedding_size]), + all_tokens[0][image_idx + t][:self.token_embedding_size]) + # if j is not the current step in the window, all action dimensions + # from previous steps are already infered and thus can be checked. + action_dims_range = self.action_inf_idx if j == window_range - 1 else self._num_action_tokens + for t in range(action_dims_range): + token_idx = action_start_index + t + action_value = (first_step_idx + j) * self._num_action_tokens + t + self.assertAllEqual( + self.get_action_logits( + batch_size=batch_size, + value=action_value, + vocab_size=self.vocab_size)[0][0][:self.token_embedding_size], + all_tokens[0][token_idx][:self.token_embedding_size]) + # Output the right action dimension value. + image_token_index = ( + min(self._step_idx, self.time_sequence_length - 1) * + num_tokens_per_step) + transformer_shift = -1 + action_index = ( + image_token_index + NUM_IMAGE_TOKENS + self.action_inf_idx + + transformer_shift) + action_value = self._step_idx * self._num_action_tokens + self.action_inf_idx + action_logits = self.get_action_logits( + batch_size=batch_size, value=action_value, vocab_size=self.vocab_size) + output_tokens = tf.concat([ + output_tokens[:, :action_index, :], action_logits[:, :, :], + output_tokens[:, action_index + 1:, :] + ], + axis=1) + self.action_inf_idx = (self.action_inf_idx + 1) % self._num_action_tokens + attention_scores = [] + return output_tokens, attention_scores diff --git a/transformer_test.py b/transformer_test.py new file mode 100644 index 0000000..4d3aa0f --- /dev/null +++ b/transformer_test.py @@ -0,0 +1,55 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for transformer.""" +from absl.testing import parameterized +from robotics_transformer import transformer +import tensorflow as tf + + +class TransformerTest(parameterized.TestCase): + + def setUp(self): + self._vocab_size = 10 + batch_size = 8 + sequence_len = 12 + self._tokens = tf.random.uniform( + [batch_size, sequence_len, self._vocab_size], + minval=0, + maxval=1, + dtype=tf.dtypes.float32, + ) + super(TransformerTest, self).setUp() + + @parameterized.parameters(True, False) + def test_transformer_forwardpass(self, return_attention_scores): + network = transformer.Transformer( + num_layers=2, + layer_size=512, + num_heads=4, + feed_forward_size=256, + dropout_rate=0.1, + vocab_size=self._vocab_size, + return_attention_scores=return_attention_scores) + + output_tokens, attention_scores = network(self._tokens, attention_mask=None) + self.assertSequenceEqual(self._tokens.shape.as_list(), + output_tokens.shape.as_list()) + if return_attention_scores: + self.assertNotEmpty(attention_scores) + else: + self.assertEmpty(attention_scores) + + +if __name__ == '__main__': + tf.test.main()