add robotics transformer

This commit is contained in:
Keerthana P G 2022-12-09 11:58:47 -08:00
commit ccd9d2fe2d
41 changed files with 4946 additions and 0 deletions

25
.gitignore vendored Normal file
View File

@@ -0,0 +1,25 @@
# Compiled python modules.
*.pyc
# Byte-compiled
__pycache__/
.cache/
# Poetry, setuptools, PyPI distribution artifacts.
/*.egg-info
.eggs/
build/
dist/
poetry.lock
# Tests
.pytest_cache/
# Type checking
.pytype/
# Other
*.DS_Store
# PyCharm
.idea

447
.pylintrc Normal file
View File

@@ -0,0 +1,447 @@
# This Pylint rcfile contains a best-effort configuration to uphold the
# best-practices and style described in the Google Python style guide:
# https://google.github.io/styleguide/pyguide.html
#
# Its canonical open-source location is:
# https://google.github.io/styleguide/pylintrc
[MASTER]
# Add files or directories to the ignore list. They should be base names, not
# paths.
ignore=third_party
# Add files or directories matching the regex patterns to the ignore list. The
# regex matches against base names, not paths.
ignore-patterns=
# Pickle collected data for later comparisons.
persistent=no
# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=
# Use multiple processes to speed up Pylint.
jobs=4
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loading into the active Python interpreter and may
# run arbitrary code.
extension-pkg-allow-list=
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
confidence=
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
#enable=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once).You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use"--disable=all --enable=classes
# --disable=W"
disable=abstract-method,
apply-builtin,
arguments-differ,
attribute-defined-outside-init,
backtick,
bad-option-value,
basestring-builtin,
buffer-builtin,
c-extension-no-member,
consider-using-enumerate,
cmp-builtin,
cmp-method,
coerce-builtin,
coerce-method,
delslice-method,
div-method,
duplicate-code,
eq-without-hash,
execfile-builtin,
file-builtin,
filter-builtin-not-iterating,
fixme,
getslice-method,
global-statement,
hex-method,
idiv-method,
implicit-str-concat-in-sequence,
import-error,
import-self,
import-star-module-level,
inconsistent-return-statements,
input-builtin,
intern-builtin,
invalid-str-codec,
locally-disabled,
long-builtin,
long-suffix,
map-builtin-not-iterating,
misplaced-comparison-constant,
missing-function-docstring,
metaclass-assignment,
next-method-called,
next-method-defined,
no-absolute-import,
no-else-break,
no-else-continue,
no-else-raise,
no-else-return,
no-init, # added
no-member,
no-name-in-module,
no-self-use,
nonzero-method,
oct-method,
old-division,
old-ne-operator,
old-octal-literal,
old-raise-syntax,
parameter-unpacking,
print-statement,
raising-string,
range-builtin-not-iterating,
raw_input-builtin,
rdiv-method,
reduce-builtin,
relative-import,
reload-builtin,
round-builtin,
setslice-method,
signature-differs,
standarderror-builtin,
suppressed-message,
sys-max-int,
too-few-public-methods,
too-many-ancestors,
too-many-arguments,
too-many-boolean-expressions,
too-many-branches,
too-many-instance-attributes,
too-many-locals,
too-many-nested-blocks,
too-many-public-methods,
too-many-return-statements,
too-many-statements,
trailing-newlines,
unichr-builtin,
unicode-builtin,
unnecessary-pass,
unpacking-in-except,
useless-else-on-loop,
useless-object-inheritance,
useless-suppression,
using-cmp-argument,
wrong-import-order,
xrange-builtin,
zip-builtin-not-iterating,
[REPORTS]
# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html. You can also give a reporter class, eg
# mypackage.mymodule.MyReporterClass.
output-format=text
# Put messages in a separate file for each module / package specified on the
# command line instead of printing them on stdout. Reports (if any) will be
# written in a file name "pylint_global.[txt|html]". This option is deprecated
# and it will be removed in Pylint 2.0.
files-output=no
# Tells whether to display a full report or only the messages
reports=no
# Python expression which should return a note less than 10 (10 is the highest
# note). You have access to the variables errors warning, statement which
# respectively contain the number of errors / warnings messages and the total
# number of statements analyzed. This is used by the global evaluation report
# (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details
#msg-template=
[BASIC]
# Good variable names which should always be accepted, separated by a comma
good-names=main,_
# Bad variable names which should always be refused, separated by a comma
bad-names=
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Include a hint for the correct naming format with invalid-name
include-naming-hint=no
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl
# Regular expression matching correct function names
function-rgx=^(?:(?P<exempt>setUp|tearDown|setUpModule|tearDownModule)|(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$
# Regular expression matching correct variable names
variable-rgx=^[a-z][a-z0-9_]*$
# Regular expression matching correct constant names
const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
# Regular expression matching correct attribute names
attr-rgx=^_{0,2}[a-z][a-z0-9_]*$
# Regular expression matching correct argument names
argument-rgx=^[a-z][a-z0-9_]*$
# Regular expression matching correct class attribute names
class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
# Regular expression matching correct inline iteration names
inlinevar-rgx=^[a-z][a-z0-9_]*$
# Regular expression matching correct class names
class-rgx=^_?[A-Z][a-zA-Z0-9]*$
# Regular expression matching correct module names
module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$
# Regular expression matching correct method names
method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=10
[TYPECHECK]
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis. It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=
[FORMAT]
# Maximum number of characters on a single line.
max-line-length=80
# TODO(https://github.com/PyCQA/pylint/issues/3352): Direct pylint to exempt
# lines made too long by directives to pytype.
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=(?x)(
^\s*(\#\ )?<?https?://\S+>?$|
^\s*(from\s+\S+\s+)?import\s+.+$)
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=yes
# List of optional constructs for which whitespace checking is disabled. `dict-
# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
# `trailing-comma` allows a space between comma and closing bracket: (a, ).
# `empty-line` allows space-only lines.
no-space-check=
# Maximum number of lines in a module
max-module-lines=99999
# String used as indentation unit. The internal Google style guide mandates 2
# spaces. Google's externally-published style guide says 4, consistent with
# PEP 8. Here, we use 2 spaces, for conformity with many open-sourced Google
# projects (like TensorFlow).
indent-string='  '
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=TODO
[STRING]
# This flag controls whether inconsistent-quotes generates a warning when the
# character used as a quote delimiter is used inconsistently within a module.
check-quote-consistency=yes
[VARIABLES]
# Tells whether we should check for unused import in __init__ files.
init-import=no
# A regular expression matching the name of dummy variables (i.e. expectedly
# not used).
dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid to define new builtins when possible.
additional-builtins=
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,_cb
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools
[LOGGING]
# Logging modules to check that the string format arguments are in logging
# function parameter format
logging-modules=logging,absl.logging,tensorflow.io.logging
[SIMILARITIES]
# Minimum lines number of a similarity.
min-similarity-lines=4
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
[SPELLING]
# Spelling dictionary name. Available dictionaries: none. To make it working
# install python-enchant package.
spelling-dict=
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to indicated private dictionary in
# --spelling-private-dict-file option instead of raising a message.
spelling-store-unknown-words=no
[IMPORTS]
# Deprecated modules which should not be used, separated by a comma
deprecated-modules=regsub,
TERMIOS,
Bastion,
rexec,
sets
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant, absl
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
[CLASSES]
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
__new__,
setUp
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,
_fields,
_replace,
_source,
_make
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls,
class_
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=StandardError,
Exception,
BaseException

30
.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,30 @@
{
"files.insertFinalNewline": true,
"files.trimFinalNewlines": true,
"files.trimTrailingWhitespace": true,
"files.associations": {
".pylintrc": "ini"
},
"python.testing.unittestEnabled": false,
"python.testing.nosetestsEnabled": false,
"python.testing.pytestEnabled": true,
"python.linting.pylintUseMinimalCheckers": false,
"[python]": {
"editor.rulers": [
80
],
"editor.tabSize": 2,
"editor.formatOnSave": true,
"editor.detectIndentation": false
},
"python.formatting.provider": "black",
"python.formatting.blackPath": "pyink",
"files.watcherExclude": {
"**/.git/**": true
},
"files.exclude": {
"**/__pycache__": true,
"**/.pytest_cache": true,
"**/*.egg-info": true
}
}

31
CHANGELOG.md Normal file
View File

@@ -0,0 +1,31 @@
# Changelog
<!--
Changelog follows the https://keepachangelog.com/ standard (at least the headers).
This allows:
* auto-parsing release notes during the automated releases from github-action:
https://github.com/marketplace/actions/pypi-github-auto-release
* Have clickable headers in the rendered markdown
To release a new version (e.g. from `1.0.0` -> `2.0.0`):
* Create a new `# [2.0.0] - YYYY-MM-DD` header and add the current
`[Unreleased]` notes.
* At the end of the file:
* Define the new link url:
`[2.0.0]: https://github.com/google-research/robotics_transformer/compare/v1.0.0...v2.0.0`
* Update the `[Unreleased]` url: `v1.0.0...HEAD` -> `v2.0.0...HEAD`
-->
## [Unreleased]
## [0.1.0] - 2022-01-01
* Initial release
[Unreleased]: https://github.com/google-research/robotics_transformer/compare/v0.1.0...HEAD
[0.1.0]: https://github.com/google-research/robotics_transformer/releases/tag/v0.1.0

29
CONTRIBUTING.md Normal file
View File

@@ -0,0 +1,29 @@
# How to Contribute
We'd love to accept your patches and contributions to this project. There are
just a few small guidelines you need to follow.
## Contributor License Agreement
Contributions to this project must be accompanied by a Contributor License
Agreement (CLA). You (or your employer) retain the copyright to your
contribution; this simply gives us permission to use and redistribute your
contributions as part of the project. Head over to
<https://cla.developers.google.com/> to see your current agreements on file or
to sign a new one.
You generally only need to submit a CLA once, so if you've already submitted one
(even if it was for a different project), you probably don't need to do it
again.
## Code Reviews
All submissions, including submissions by project members, require review. We
use GitHub pull requests for this purpose. Consult
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
information on using pull requests.
## Community Guidelines
This project follows
[Google's Open Source Community Guidelines](https://opensource.google/conduct/).

202
LICENSE Normal file
View File

@@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

134
METADATA Normal file
View File

@@ -0,0 +1,134 @@
# go/google3metadata
# proto-file: devtools/metadata/metadata.proto
# proto-message: MetaData
name: "robotics_transformer"
description: "code and utilities to build and run RT-1 robotics transformer"
third_party {
url {
type: HOMEPAGE
value: "http://go/robotics_transformer"
}
url {
type: PIPER
value: "http://google3/third_party/py/robotics_transformer"
}
}
presubmit: {
review_notify: "robotics_transformer-automated+reviews"
check_tests: {
failure_status: ERROR
project: "robotics_transformer"
}
# Checks that files in the changelist do not contain tab characters.
check_tabs: {
failure_status: ERROR
}
check_trailing_whitespace: {
failure_status: ERROR
}
# Presubmit applied during submit
check_lint: {
action: SUBMIT # Do not activate by default to not block TAP.
failure_status: ERROR
}
# Ensures that the string "do not submit" (in all caps) is not present.
check_do_not_submit: {
action: SUBMIT
}
}
# Register the copy.bara.sky
exported: {
copybara: {
config_path: "//depot/google3/third_party/py/robotics_transformer/copy.bara.sky"
}
path_expression: "//depot/google3/third_party/py/robotics_transformer/..."
remote_location: "https://github.com/google-research/robotics_transformer"
reason: OPEN_SOURCE
description: "Open source robotics_transformer"
# request_url: "https://launch.corp.google.com/launch/4225970"
owning_team_email: "robotics_transformer-automated@google.com"
}
# Copybara presubmit
# presubmit: {
# path_expression: "//depot/google3/third_party/py/robotics_transformer/..."
# # Do not trigger copybara for the following files
# path_expression_exclusion: "//depot/.../METADATA"
# path_expression_exclusion: "//depot/.../OWNERS"
# path_expression_exclusion: "//depot/.../BUILD"
# path_expression_exclusion: "//depot/.../*.bzl"
# path_expression_exclusion: "//depot/.../google/..."
# # Ensure that changes contain public notes for git commit messages.
# check_description: {
# base: {
# id: "CopybaraDescription"
# disable_tags: "GIT_ORIGIN_REV_ID"
# disable_tags: "SKIP_COPYBARA"
# }
# required_regexp:
# "("
# "(^|\\n)\\s*BEGIN_PUBLIC\\s*?\\n"
# "(.*\\n)*"
# "\\s*\\S+.*(\\n.*)*\\n"
# "\\s*END_PUBLIC\\s*?\\n"
# "|"
# "(^|\\n)\\s*PUBLIC:(?: )*\\S+"
# ")"
# failure_message:
# "\n"
# "By running presubmit, this cl will be exported as PR on github. "
# "Please add a public commit message to the cl description:\n"
# "\n"
# "PUBLIC: my public commit msg\n"
# "\n"
# "OR\n"
# "\n"
# "BEGIN_PUBLIC\n"
# "my public\n"
# "commit msg\n"
# "END_PUBLIC\n"
# "\n"
# "If you're certain your change does not produce public changes, the\n"
# "message can say 'Internal'.\n"
# failure_status: WARNING
# required_for_cleanup: false
# }
# check_presubmit_service: {
# base: { id: "Copybara-Review" disable_tags: "GIT_ORIGIN_REV_ID" }
# action: REVIEW
# streaming: true
# timeout: 60
# failure_status: WARNING
# execution_mode: SECONDARY_EXECUTION
# include_all_opened_files: true
# include_deleted_files: true
# address: "blade:copybara-streaming-presubmit-service-prod"
# options: "depot_path=//depot/google3/third_party/py/robotics_transformer/copy.bara.sky;workflow=piper_to_github_presubmit;blocking=false"
# }
# check_presubmit_service: {
# base: { id: "Copybara-Submit" disable_tags: "GIT_ORIGIN_REV_ID" }
# action: SUBMIT
# streaming: true
# timeout: 600
# failure_status: ERROR
# execution_mode: SECONDARY_EXECUTION
# include_all_opened_files: true
# include_deleted_files: true
# address: "blade:copybara-streaming-presubmit-service-prod"
# options: "depot_path=//depot/google3/third_party/py/robotics_transformer/copy.bara.sky;workflow=piper_to_github_presubmit;blocking=true"
# }
# }

2
OWNERS Normal file
View File

@@ -0,0 +1,2 @@
keerthanapg
yaolug

45
README.md Normal file
View File

@@ -0,0 +1,45 @@
# Robotics Transformer
*This is not an officially supported Google product.*
This repository is a collection of code files and artifacts for running the
Robotics Transformer, or RT-1.
## Features
* FiLM EfficientNet-based image tokenizer backbone
* TokenLearner-based compression of input tokens
* Transformer for end-to-end robotic control
* Testing utilities
## Getting Started
### Installation
Clone the repo, install its dependencies, and run a quick test:
```bash
git clone https://github.com/google-research/robotics_transformer.git
pip install -r robotics_transformer/requirements.txt
python -m robotics_transformer.tokenizers.action_tokenizer.test
```
### Running Tests
To run RT-1 tests, you can clone the git repo and run
[bazel](https://bazel.build/):
```bash
git clone https://github.com/google-research/robotics_transformer.git
cd robotics_transformer
bazel test ...
```
## Future Releases
The current repository includes an initial set of libraries for early adoption.
More components may come in future releases.
## License
The Robotics Transformer library is licensed under the terms of the Apache
license.

18
__init__.py Normal file
View File

@@ -0,0 +1,18 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""robotics_transformer API."""
# A new PyPI release will be pushed every time `__version__` is increased.
# When changing this, also update the CHANGELOG.md
__version__ = '0.1.0'
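
As a sanity check (not part of the commit), the declared version can be read back from the installed package, assuming it has been pip-installed or is on `PYTHONPATH`:

```python
import robotics_transformer

# Should match the version declared above and the latest CHANGELOG.md entry.
print(robotics_transformer.__version__)  # '0.1.0'
```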

View File

@@ -0,0 +1,27 @@
from __gin__ import dynamic_registration
from robotics_transformer import transformer_network
from robotics_transformer.tokenizers import image_tokenizer
import tensorflow as tf
LEARNING_RATE_ACTOR = 0.0001
SEQUENCE_LENGTH = 6
transformer_network.TransformerNetwork:
num_layers = 8
layer_size = 128
num_heads = 8
feed_forward_size = 512
dropout_rate = 0.1
vocab_size = 256
token_embedding_size = 512
time_sequence_length = %SEQUENCE_LENGTH
crop_size = %CROP_SIZE
action_order = %ACTION_ORDER
use_token_learner = True
actor_optimizer/tf.keras.optimizers.Adam:
learning_rate = %LEARNING_RATE_ACTOR
ACTOR_NETWORK = @transformer_network.TransformerNetwork
ACTOR_OPTIMIZER = @actor_optimizer/tf.keras.optimizers.Adam()
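
The config above wires the RT-1 network hyperparameters through gin with dynamic registration. Below is a minimal sketch of how such a file is typically consumed; the filename `transformer_network.gin` and the `CROP_SIZE`/`ACTION_ORDER` bindings are assumptions, since neither is defined in this excerpt:

```python
import gin
import tensorflow as tf  # noqa: F401 -- referenced by the config

# Modules referenced by the config must be importable so gin's dynamic
# registration can resolve them at parse time.
from robotics_transformer import transformer_network  # noqa: F401
from robotics_transformer.tokenizers import image_tokenizer  # noqa: F401

# Bind the macros the config leaves open (placeholder values), then parse it.
gin.parse_config('CROP_SIZE = 236\nACTION_ORDER = None')
gin.parse_config_file('transformer_network.gin')  # assumed filename

# Any TransformerNetwork(...) constructed by the training code now receives
# num_layers=8, layer_size=128, vocab_size=256, etc. from the bindings above.
```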

View File

@@ -0,0 +1,13 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,74 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ResNet variants model for Keras with Film-Conditioning.
Related papers/blogs:
- https://arxiv.org/abs/1512.03385
- https://arxiv.org/pdf/1603.05027v2.pdf
- http://torch.ch/blog/2016/02/04/resnets.html
- https://arxiv.org/abs/1709.07871
"""
import tensorflow.compat.v2 as tf
layers = tf.keras.layers
class FilmConditioning(tf.keras.layers.Layer):
"""Layer that adds FiLM conditioning.
This is intended to be applied after a convolutional layer. It will learn a
multiplicative and an additive factor to be applied to each channel of the
convolution's output.
Conv layer can be rank 2 or 4.
For further details, see: https://arxiv.org/abs/1709.07871
"""
def __init__(self, num_channels: int):
"""Constructs a FiLM conditioning layer.
Args:
num_channels: Number of filter channels to expect in the input.
"""
super().__init__()
# Note that we initialize with zeros because empirically we have found
# this works better than initializing with glorot.
self._projection_add = layers.Dense(
num_channels,
activation=None,
kernel_initializer='zeros',
bias_initializer='zeros')
self._projection_mult = layers.Dense(
num_channels,
activation=None,
kernel_initializer='zeros',
bias_initializer='zeros')
def call(self, conv_filters: tf.Tensor, conditioning: tf.Tensor):
tf.debugging.assert_rank(conditioning, 2)
projected_cond_add = self._projection_add(conditioning)
projected_cond_mult = self._projection_mult(conditioning)
if len(conv_filters.shape) == 4:
# [B, D] -> [B, 1, 1, D]
projected_cond_add = projected_cond_add[:, tf.newaxis, tf.newaxis]
projected_cond_mult = projected_cond_mult[:, tf.newaxis, tf.newaxis]
else:
tf.debugging.assert_rank(conv_filters, 2)
# Original FiLM paper argues that 1 + gamma centers the initialization at
# identity transform.
result = (1 + projected_cond_mult) * conv_filters + projected_cond_add
return result
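
A brief usage sketch for the layer above (illustrative, not part of the commit). Because both projections are zero-initialized, the layer acts as an identity over the convolutional features at initialization; the conditioning signal only modulates the features once the projections are trained. The import path mirrors the one used in the accompanying test:

```python
import tensorflow as tf

from robotics_transformer.film_efficientnet import film_conditioning_layer

conv = tf.random.normal([2, 10, 10, 32])   # rank-4 conv features [B, H, W, C]
context = tf.random.normal([2, 512])       # conditioning vector, e.g. a text embedding
film = film_conditioning_layer.FilmConditioning(num_channels=32)

out = film(conv, context)
print(out.shape)                       # (2, 10, 10, 32)
tf.debugging.assert_near(out, conv)    # identity at init, thanks to the zero init
```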

View File

@@ -0,0 +1,40 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for film_conditioning_layer."""
from absl.testing import parameterized
import numpy as np
from robotics_transformer.film_efficientnet import film_conditioning_layer
import tensorflow as tf
class FilmConditioningLayerTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters([2, 4])
def test_film_conditioning_rank_two_and_four(self, conv_rank):
batch = 2
num_channels = 3
if conv_rank == 2:
conv_layer = np.random.randn(batch, num_channels)
elif conv_rank == 4:
conv_layer = np.random.randn(batch, 1, 1, num_channels)
else:
raise ValueError(f'Unexpected conv rank: {conv_rank}')
context = np.random.rand(batch, num_channels)
film_layer = film_conditioning_layer.FilmConditioning(num_channels)
out = film_layer(conv_layer, context)
tf.debugging.assert_rank(out, conv_rank)
if __name__ == '__main__':
tf.test.main()

View File

@@ -0,0 +1,759 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pytype: skip-file
# pylint: skip-file
"""EfficientNet models modified with added film layers.
Mostly copied from third_party/py/keras/applications/efficientnet.py
"""
import copy
import math
import os
import warnings
import json
from absl import logging
import tensorflow.compat.v2 as tf
from tensorflow.keras import layers
from robotics_transformer.film_efficientnet.film_conditioning_layer import FilmConditioning
BASE_WEIGHTS_PATH = 'efficientnet_checkpoints/efficientnet'
IMAGENET_JSON_PATH = 'efficientnet_checkpoints/imagenet_classes.json'
CLASS_INDEX = None
WEIGHTS_PATHS = {
'efficientnetb3': BASE_WEIGHTS_PATH + 'b3.h5',
'efficientnetb3_notop': BASE_WEIGHTS_PATH + 'b3_notop.h5',
}
DEFAULT_BLOCKS_ARGS = [{
'kernel_size': 3,
'repeats': 1,
'filters_in': 32,
'filters_out': 16,
'expand_ratio': 1,
'id_skip': True,
'strides': 1,
'se_ratio': 0.25
}, {
'kernel_size': 3,
'repeats': 2,
'filters_in': 16,
'filters_out': 24,
'expand_ratio': 6,
'id_skip': True,
'strides': 2,
'se_ratio': 0.25
}, {
'kernel_size': 5,
'repeats': 2,
'filters_in': 24,
'filters_out': 40,
'expand_ratio': 6,
'id_skip': True,
'strides': 2,
'se_ratio': 0.25
}, {
'kernel_size': 3,
'repeats': 3,
'filters_in': 40,
'filters_out': 80,
'expand_ratio': 6,
'id_skip': True,
'strides': 2,
'se_ratio': 0.25
}, {
'kernel_size': 5,
'repeats': 3,
'filters_in': 80,
'filters_out': 112,
'expand_ratio': 6,
'id_skip': True,
'strides': 1,
'se_ratio': 0.25
}, {
'kernel_size': 5,
'repeats': 4,
'filters_in': 112,
'filters_out': 192,
'expand_ratio': 6,
'id_skip': True,
'strides': 2,
'se_ratio': 0.25
}, {
'kernel_size': 3,
'repeats': 1,
'filters_in': 192,
'filters_out': 320,
'expand_ratio': 6,
'id_skip': True,
'strides': 1,
'se_ratio': 0.25
}]
CONV_KERNEL_INITIALIZER = {
'class_name': 'VarianceScaling',
'config': {
'scale': 2.0,
'mode': 'fan_out',
'distribution': 'truncated_normal'
}
}
DENSE_KERNEL_INITIALIZER = {
'class_name': 'VarianceScaling',
'config': {
'scale': 1. / 3.,
'mode': 'fan_out',
'distribution': 'uniform'
}
}
BASE_DOCSTRING = """Instantiates the {name} architecture.
Reference:
- [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](
https://arxiv.org/abs/1905.11946) (ICML 2019)
This function returns a Keras image classification model,
optionally loaded with weights pre-trained on ImageNet.
For image classification use cases, see
[this page for detailed examples](
https://keras.io/api/applications/#usage-examples-for-image-classification-models).
For transfer learning use cases, make sure to read the
[guide to transfer learning & fine-tuning](
https://keras.io/guides/transfer_learning/).
Note: each Keras Application expects a specific kind of input preprocessing.
For EfficientNet, input preprocessing is included as part of the model
(as a `Rescaling` layer), and thus
`tf.keras.applications.efficientnet.preprocess_input` is actually a
pass-through function. EfficientNet models expect their inputs to be float
tensors of pixels with values in the [0-255] range.
Args:
include_top: Whether to include the fully-connected
layer at the top of the network. Defaults to True.
weights: One of `None` (random initialization),
'imagenet' (pre-training on ImageNet),
or the path to the weights file to be loaded. Defaults to 'imagenet'.
input_tensor: Optional Keras tensor
(i.e. output of `layers.Input()`)
to use as image input for the model.
input_shape: Optional shape tuple, only to be specified
if `include_top` is False.
It should have exactly 3 input channels.
pooling: Optional pooling mode for feature extraction
when `include_top` is `False`. Defaults to None.
- `None` means that the output of the model will be
the 4D tensor output of the
last convolutional layer.
- `avg` means that global average pooling
will be applied to the output of the
last convolutional layer, and thus
the output of the model will be a 2D tensor.
- `max` means that global max pooling will
be applied.
classes: Optional number of classes to classify images
into, only to be specified if `include_top` is True, and
if no `weights` argument is specified. Defaults to 1000 (number of
ImageNet classes).
classifier_activation: A `str` or callable. The activation function to use
on the "top" layer. Ignored unless `include_top=True`. Set
`classifier_activation=None` to return the logits of the "top" layer.
Defaults to 'softmax'.
When loading pretrained weights, `classifier_activation` can only
be `None` or `"softmax"`.
Returns:
A `keras.Model` instance.
"""
IMAGENET_STDDEV_RGB = [0.229, 0.224, 0.225]
def validate_activation(classifier_activation, weights):
"""validates that the classifier is compatible with the weights.
Args:
classifier_activation: str or callable activation function
weights: The pretrained weights to load.
Raises:
ValueError: if an activation other than `None` or `softmax` are used with
pretrained weights.
"""
if weights is None:
return
classifier_activation = tf.keras.activations.get(classifier_activation)
if classifier_activation not in {
tf.keras.activations.get('softmax'),
tf.keras.activations.get(None)
}:
raise ValueError('Only `None` and `softmax` activations are allowed '
'for the `classifier_activation` argument when using '
'pretrained weights, with `include_top=True`; Received: '
f'classifier_activation={classifier_activation}')
def correct_pad(inputs, kernel_size):
"""Returns a tuple for zero-padding for 2D convolution with downsampling.
Args:
inputs: Input tensor.
kernel_size: An integer or tuple/list of 2 integers.
Returns:
A tuple.
"""
img_dim = 2 if tf.keras.backend.image_data_format() == 'channels_first' else 1
input_size = tf.keras.backend.int_shape(inputs)[img_dim:(img_dim + 2)]
if isinstance(kernel_size, int):
kernel_size = (kernel_size, kernel_size)
if input_size[0] is None:
adjust = (1, 1)
else:
adjust = (1 - input_size[0] % 2, 1 - input_size[1] % 2)
correct = (kernel_size[0] // 2, kernel_size[1] // 2)
return ((correct[0] - adjust[0], correct[0]), (correct[1] - adjust[1],
correct[1]))
def obtain_input_shape(input_shape,
default_size,
min_size,
data_format,
require_flatten,
weights=None):
"""Internal utility to compute/validate a model's input shape.
Args:
input_shape: Either None (will return the default network input shape), or a
user-provided shape to be validated.
default_size: Default input width/height for the model.
min_size: Minimum input width/height accepted by the model.
data_format: Image data format to use.
require_flatten: Whether the model is expected to be linked to a classifier
via a Flatten layer.
weights: One of `None` (random initialization) or 'imagenet' (pre-training
on ImageNet). If weights='imagenet' input channels must be equal to 3.
Returns:
An integer shape tuple (may include None entries).
Raises:
ValueError: In case of invalid argument values.
"""
if weights != 'imagenet' and input_shape and len(input_shape) == 3:
if data_format == 'channels_first':
if input_shape[0] not in {1, 3}:
warnings.warn(
'This model usually expects 1 or 3 input channels. '
'However, it was passed an input_shape with ' +
str(input_shape[0]) + ' input channels.',
stacklevel=2)
default_shape = (input_shape[0], default_size, default_size)
else:
if input_shape[-1] not in {1, 3}:
warnings.warn(
'This model usually expects 1 or 3 input channels. '
'However, it was passed an input_shape with ' +
str(input_shape[-1]) + ' input channels.',
stacklevel=2)
default_shape = (default_size, default_size, input_shape[-1])
else:
if data_format == 'channels_first':
default_shape = (3, default_size, default_size)
else:
default_shape = (default_size, default_size, 3)
if weights == 'imagenet' and require_flatten:
if input_shape is not None:
if input_shape != default_shape:
raise ValueError('When setting `include_top=True` '
'and loading `imagenet` weights, '
f'`input_shape` should be {default_shape}. '
f'Received: input_shape={input_shape}')
return default_shape
if input_shape:
if data_format == 'channels_first':
if input_shape is not None:
if len(input_shape) != 3:
raise ValueError('`input_shape` must be a tuple of three integers.')
if input_shape[0] != 3 and weights == 'imagenet':
raise ValueError('The input must have 3 channels; Received '
f'`input_shape={input_shape}`')
if ((input_shape[1] is not None and input_shape[1] < min_size) or
(input_shape[2] is not None and input_shape[2] < min_size)):
raise ValueError(f'Input size must be at least {min_size}'
f'x{min_size}; Received: '
f'input_shape={input_shape}')
else:
if input_shape is not None:
if len(input_shape) != 3:
raise ValueError('`input_shape` must be a tuple of three integers.')
if input_shape[-1] != 3 and weights == 'imagenet':
raise ValueError('The input must have 3 channels; Received '
f'`input_shape={input_shape}`')
if ((input_shape[0] is not None and input_shape[0] < min_size) or
(input_shape[1] is not None and input_shape[1] < min_size)):
raise ValueError('Input size must be at least '
f'{min_size}x{min_size}; Received: '
f'input_shape={input_shape}')
else:
if require_flatten:
input_shape = default_shape
else:
if data_format == 'channels_first':
input_shape = (3, None, None)
else:
input_shape = (None, None, 3)
if require_flatten:
if None in input_shape:
raise ValueError('If `include_top` is True, '
'you should specify a static `input_shape`. '
f'Received: input_shape={input_shape}')
return input_shape
def EfficientNet(width_coefficient,
depth_coefficient,
default_size,
dropout_rate=0.2,
drop_connect_rate=0.2,
depth_divisor=8,
activation='swish',
blocks_args='default',
model_name='efficientnet',
include_top=True,
weights='imagenet',
input_tensor=None,
input_shape=None,
pooling=None,
classes=1000,
classifier_activation='softmax',
include_film=False):
"""Instantiates the EfficientNet architecture using given scaling coefficients.
Args:
width_coefficient: float, scaling coefficient for network width.
depth_coefficient: float, scaling coefficient for network depth.
default_size: integer, default input image size.
dropout_rate: float, dropout rate before final classifier layer.
drop_connect_rate: float, dropout rate at skip connections.
depth_divisor: integer, a unit of network width.
activation: activation function.
blocks_args: list of dicts, parameters to construct block modules.
model_name: string, model name.
include_top: whether to include the fully-connected layer at the top of the
network.
weights: one of `None` (random initialization), 'imagenet' (pre-training on
ImageNet), or the path to the weights file to be loaded.
input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) to use
as image input for the model.
input_shape: optional shape tuple, only to be specified if `include_top` is
False. It should have exactly 3 input channels.
pooling: optional pooling mode for feature extraction when `include_top` is
`False`. - `None` means that the output of the model will be the 4D tensor
output of the last convolutional layer. - `avg` means that global average
pooling will be applied to the output of the last convolutional layer, and
thus the output of the model will be a 2D tensor. - `max` means that
global max pooling will be applied.
classes: optional number of classes to classify images into, only to be
specified if `include_top` is True, and if no `weights` argument is
specified.
classifier_activation: A `str` or callable. The activation function to use
on the "top" layer. Ignored unless `include_top=True`. Set
`classifier_activation=None` to return the logits of the "top" layer.
include_film: bool, whether or not to insert film conditioning layers.
Returns:
A `keras.Model` instance.
Raises:
ValueError: in case of invalid argument for `weights`,
or invalid input shape.
ValueError: if `classifier_activation` is not `softmax` or `None` when
using a pretrained top layer.
"""
if blocks_args == 'default':
blocks_args = DEFAULT_BLOCKS_ARGS
if not (weights in {'imagenet', None} or tf.io.gfile.exists(weights)):
raise ValueError('The `weights` argument should be either '
'`None` (random initialization), `imagenet` '
'(pre-training on ImageNet), '
'or the path to the weights file to be loaded.')
if weights == 'imagenet' and include_top and classes != 1000:
raise ValueError('If using `weights` as `"imagenet"` with `include_top`'
' as true, `classes` should be 1000')
# Determine proper input shape
input_shape = obtain_input_shape(
input_shape,
default_size=default_size,
min_size=32,
data_format=tf.keras.backend.image_data_format(),
require_flatten=include_top,
weights=weights)
if include_film:
with tf.compat.v1.variable_scope('context_input'):
context_input = layers.Input(shape=512)
if input_tensor is None:
img_input = layers.Input(shape=input_shape)
else:
if not tf.keras.backend.is_keras_tensor(input_tensor):
img_input = layers.Input(tensor=input_tensor, shape=input_shape)
else:
img_input = input_tensor
bn_axis = 3 if tf.keras.backend.image_data_format() == 'channels_last' else 1
def round_filters(filters, divisor=depth_divisor):
"""Round number of filters based on depth multiplier."""
filters *= width_coefficient
new_filters = max(divisor, int(filters + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_filters < 0.9 * filters:
new_filters += divisor
return int(new_filters)
def round_repeats(repeats):
"""Round number of repeats based on depth multiplier."""
return int(math.ceil(depth_coefficient * repeats))
# Build stem
x = img_input
x = layers.Rescaling(1. / 255.)(x)
x = layers.Normalization(axis=bn_axis)(x)
# Note that the normalization layer uses the squared value of STDDEV as the
# variance for the layer: result = (input - mean) / sqrt(var).
# However, the original implementation uses (input - mean) / var to
# normalize the input; we need to divide by another sqrt(var) to match the
# original implementation.
# See https://github.com/tensorflow/tensorflow/issues/49930 for more details
# We always apply this transformation, even when not using imagenet weights,
# because it needs to be in the graph when grafting weights from imagenet
# pretrained models.
x = layers.Rescaling(1. / tf.math.sqrt(IMAGENET_STDDEV_RGB))(x)
x = layers.ZeroPadding2D(padding=correct_pad(x, 3), name='stem_conv_pad')(x)
x = layers.Conv2D(
round_filters(32),
3,
strides=2,
padding='valid',
use_bias=False,
kernel_initializer=CONV_KERNEL_INITIALIZER,
name='stem_conv')(
x)
x = layers.BatchNormalization(axis=bn_axis, name='stem_bn')(x)
x = layers.Activation(activation, name='stem_activation')(x)
# Build blocks
blocks_args = copy.deepcopy(blocks_args)
b = 0
blocks = float(sum(round_repeats(args['repeats']) for args in blocks_args))
for (i, args) in enumerate(blocks_args):
assert args['repeats'] > 0
# Update block input and output filters based on depth multiplier.
args['filters_in'] = round_filters(args['filters_in'])
args['filters_out'] = round_filters(args['filters_out'])
for j in range(round_repeats(args.pop('repeats'))):
# The first block needs to take care of stride and filter size increase.
if j > 0:
args['strides'] = 1
args['filters_in'] = args['filters_out']
x = block(
x,
activation,
drop_connect_rate * b / blocks,
name='block{}{}_'.format(i + 1, chr(j + 97)),
**args)
if include_film:
with tf.compat.v1.variable_scope('film_conditioning'):
x = FilmConditioning(num_channels=x.shape[-1])(x, context_input)
b += 1
# Build top
x = layers.Conv2D(
round_filters(1280),
1,
padding='same',
use_bias=False,
kernel_initializer=CONV_KERNEL_INITIALIZER,
name='top_conv')(
x)
x = layers.BatchNormalization(axis=bn_axis, name='top_bn')(x)
x = layers.Activation(activation, name='top_activation')(x)
if include_top:
x = layers.GlobalAveragePooling2D(name='avg_pool')(x)
if dropout_rate > 0:
x = layers.Dropout(dropout_rate, name='top_dropout')(x)
validate_activation(classifier_activation, weights)
x = layers.Dense(
classes,
activation=classifier_activation,
kernel_initializer=DENSE_KERNEL_INITIALIZER,
name='predictions')(
x)
else:
if pooling == 'avg':
x = layers.GlobalAveragePooling2D(name='avg_pool')(x)
elif pooling == 'max':
x = layers.GlobalMaxPooling2D(name='max_pool')(x)
# Ensure that the model takes into account
# any potential predecessors of `input_tensor`.
if input_tensor is not None:
inputs = tf.keras.utils.get_source_inputs(input_tensor)
else:
inputs = img_input
if include_film:
inputs = (img_input, context_input)
# Create model.
model = tf.keras.Model(inputs, x, name=model_name)
# Load weights.
if weights == 'imagenet':
if include_top:
key = model_name
else:
key = model_name + '_notop'
weights_path = os.path.join(os.path.dirname(__file__), WEIGHTS_PATHS[key])
model.load_weights(weights_path, skip_mismatch=False, by_name=False)
elif weights is not None:
model.load_weights(weights, skip_mismatch=False, by_name=False)
return model
def block(inputs,
activation='swish',
drop_rate=0.,
name='',
filters_in=32,
filters_out=16,
kernel_size=3,
strides=1,
expand_ratio=1,
se_ratio=0.,
id_skip=True):
"""An inverted residual block.
Args:
inputs: input tensor.
activation: activation function.
drop_rate: float between 0 and 1, fraction of the input units to drop.
name: string, block label.
filters_in: integer, the number of input filters.
filters_out: integer, the number of output filters.
kernel_size: integer, the dimension of the convolution window.
strides: integer, the stride of the convolution.
expand_ratio: integer, scaling coefficient for the input filters.
se_ratio: float between 0 and 1, fraction to squeeze the input filters.
id_skip: boolean.
Returns:
output tensor for the block.
"""
bn_axis = 3 if tf.keras.backend.image_data_format() == 'channels_last' else 1
# Expansion phase
filters = filters_in * expand_ratio
if expand_ratio != 1:
x = layers.Conv2D(
filters,
1,
padding='same',
use_bias=False,
kernel_initializer=CONV_KERNEL_INITIALIZER,
name=name + 'expand_conv')(
inputs)
x = layers.BatchNormalization(axis=bn_axis, name=name + 'expand_bn')(x)
x = layers.Activation(activation, name=name + 'expand_activation')(x)
else:
x = inputs
# Depthwise Convolution
if strides == 2:
x = layers.ZeroPadding2D(
padding=correct_pad(x, kernel_size), name=name + 'dwconv_pad')(
x)
conv_pad = 'valid'
else:
conv_pad = 'same'
x = layers.DepthwiseConv2D(
kernel_size,
strides=strides,
padding=conv_pad,
use_bias=False,
depthwise_initializer=CONV_KERNEL_INITIALIZER,
name=name + 'dwconv')(
x)
x = layers.BatchNormalization(axis=bn_axis, name=name + 'bn')(x)
x = layers.Activation(activation, name=name + 'activation')(x)
# Squeeze and Excitation phase
if 0 < se_ratio <= 1:
filters_se = max(1, int(filters_in * se_ratio))
se = layers.GlobalAveragePooling2D(name=name + 'se_squeeze')(x)
if bn_axis == 1:
se_shape = (filters, 1, 1)
else:
se_shape = (1, 1, filters)
se = layers.Reshape(se_shape, name=name + 'se_reshape')(se)
se = layers.Conv2D(
filters_se,
1,
padding='same',
activation=activation,
kernel_initializer=CONV_KERNEL_INITIALIZER,
name=name + 'se_reduce')(
se)
se = layers.Conv2D(
filters,
1,
padding='same',
activation='sigmoid',
kernel_initializer=CONV_KERNEL_INITIALIZER,
name=name + 'se_expand')(
se)
x = layers.multiply([x, se], name=name + 'se_excite')
# Output phase
x = layers.Conv2D(
filters_out,
1,
padding='same',
use_bias=False,
kernel_initializer=CONV_KERNEL_INITIALIZER,
name=name + 'project_conv')(
x)
x = layers.BatchNormalization(axis=bn_axis, name=name + 'project_bn')(x)
if id_skip and strides == 1 and filters_in == filters_out:
if drop_rate > 0:
x = layers.Dropout(
drop_rate, noise_shape=(None, 1, 1, 1), name=name + 'drop')(
x)
x = layers.add([x, inputs], name=name + 'add')
return x
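# Illustrative sketch (not part of the original file): applying a single
# inverted residual block to a random feature map. The filter counts,
# expand_ratio and se_ratio below are arbitrary example values rather than
# EfficientNet defaults.
def _example_single_block():
  features = tf.random.uniform((1, 32, 32, 16))
  return block(
      features,
      activation='swish',
      name='example_block_',
      filters_in=16,
      filters_out=16,
      kernel_size=3,
      strides=1,
      expand_ratio=6,
      se_ratio=0.25)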
def maybe_restore_with_film(
*args,
weights='imagenet',
include_film=False,
**kwargs,
):
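  """Builds an EfficientNet, optionally transferring weights into a FiLM copy.
  When `include_film` is True, the network is first built without FiLM layers
  so the pretrained checkpoint restores cleanly; its weights are then copied
  layer-by-layer into a FiLM-enabled version of the same network.
  """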
n1 = EfficientNet(*args, weights=weights, include_film=False, **kwargs)
if not include_film:
return n1
# Copy the model weights over to a new model. This is necessary
# in case we have inserted early film layers. In this case,
# the pretrained weights will fail to restore properly
# unless we do this trick.
n2 = EfficientNet(*args, weights=None, include_film=True, **kwargs)
# The layers without the film layers.
l1 = {l.name: l for l in n1.layers}
# The layers with the film layers.
l2 = {l.name: l for l in n2.layers}
for layer_name, layer in l2.items():
if layer_name in l1:
layer.set_weights(l1[layer_name].get_weights())
# Annoyingly, the rescaling and normalization layers get different names
# in each graph.
elif 'rescaling' in layer_name:
_, num = layer_name.split('_')
l1_layer_name = 'rescaling_' + str(int(num) - 2 or '')
l1_layer_name = l1_layer_name.rstrip('_')
layer.set_weights(l1[l1_layer_name].get_weights())
elif 'normalization' in layer_name:
_, num = layer_name.split('_')
l1_layer_name = 'normalization_' + str(int(num) - 1 or '')
l1_layer_name = l1_layer_name.rstrip('_')
layer.set_weights(l1[l1_layer_name].get_weights())
return n2
def EfficientNetB3(include_top=True,
weights='imagenet',
input_tensor=None,
input_shape=None,
pooling=None,
classes=1000,
classifier_activation='softmax',
include_film=False,
**kwargs):
return maybe_restore_with_film(
1.2,
1.4,
300,
0.3,
model_name='efficientnetb3',
include_top=include_top,
weights=weights,
input_tensor=input_tensor,
input_shape=input_shape,
pooling=pooling,
classes=classes,
classifier_activation=classifier_activation,
include_film=include_film,
**kwargs)
EfficientNetB3.__doc__ = BASE_DOCSTRING.format(name='EfficientNetB3')
def preprocess_input(x, data_format=None): # pylint: disable=unused-argument
"""A placeholder method for backward compatibility.
The preprocessing logic has been included in the efficientnet model
implementation. Users are no longer required to call this method to normalize
  the input data. This method does nothing and is only kept as a placeholder to
  align the API surface between the old and new versions of the model.
Args:
x: A floating point `numpy.array` or a `tf.Tensor`.
    data_format: Optional data format of the image tensor/array. Defaults to
      None, in which case the global setting
      `tf.keras.backend.image_data_format()` is used (unless you changed it,
      it defaults to "channels_last").
Returns:
Unchanged `numpy.array` or `tf.Tensor`.
"""
return x
def decode_predictions(preds, top=5):
global CLASS_INDEX
if CLASS_INDEX is None:
with open(os.path.join(os.path.dirname(__file__), IMAGENET_JSON_PATH)) as f:
CLASS_INDEX = json.load(f)
results = []
for pred in preds:
top_indices = pred.argsort()[-top:][::-1]
result = [tuple(CLASS_INDEX[str(i)]) + (pred[i],) for i in top_indices]
result.sort(key=lambda x: x[2], reverse=True)
results.append(result)
return results
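# Illustrative sketch (not part of the original file): end-to-end ImageNet
# classification with the encoder defined above, mirroring the unit test. The
# 300x300 input size matches EfficientNetB3, `skimage.data.chelsea()` is just a
# convenient test image, and the bundled ImageNet checkpoint is assumed to be
# available on disk.
def _example_classify_cat():
  from skimage import data  # Assumption: skimage is available, as in the tests.
  model = EfficientNetB3(include_top=True, weights='imagenet')
  image = tf.image.resize(tf.expand_dims(data.chelsea(), axis=0), (300, 300))
  preds = model(preprocess_input(image), training=False)
  return decode_predictions(preds.numpy(), top=5)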

View File

@ -0,0 +1,73 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests that film_efficientnet can detect an image of a cat."""
from absl.testing import parameterized
import numpy as np
from robotics_transformer.film_efficientnet import film_efficientnet_encoder
from skimage import data
import tensorflow as tf
class FilmEfficientnetTest(tf.test.TestCase, parameterized.TestCase):
def _helper(self, include_film, model_variant):
if model_variant == 'b0':
size = 224
fe = film_efficientnet_encoder.EfficientNetB0
elif model_variant == 'b1':
size = 240
fe = film_efficientnet_encoder.EfficientNetB1
elif model_variant == 'b2':
size = 260
fe = film_efficientnet_encoder.EfficientNetB2
elif model_variant == 'b3':
size = 300
fe = film_efficientnet_encoder.EfficientNetB3
elif model_variant == 'b4':
size = 380
fe = film_efficientnet_encoder.EfficientNetB4
elif model_variant == 'b5':
size = 456
fe = film_efficientnet_encoder.EfficientNetB5
elif model_variant == 'b6':
size = 528
fe = film_efficientnet_encoder.EfficientNetB6
elif model_variant == 'b7':
size = 600
fe = film_efficientnet_encoder.EfficientNetB7
else:
raise ValueError(f'Unknown variant: {model_variant}')
fe = fe(include_top=True, weights='imagenet', include_film=include_film)
image = np.expand_dims(data.chelsea(), axis=0)
image = tf.image.resize(image, (size, size))
context = np.random.randn(1, 512)
if include_film:
eff_output = fe(
(film_efficientnet_encoder.preprocess_input(image), context),
training=False)
else:
eff_output = fe(
film_efficientnet_encoder.preprocess_input(image), training=False)
film_preds = film_efficientnet_encoder.decode_predictions(
eff_output.numpy(), top=10)
self.assertIn('tabby', [f[1] for f in film_preds[0]])
@parameterized.parameters([True, False])
def test_keras_equivalence_b3(self, include_film):
self._helper(include_film, 'b3')
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,108 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Preprocessing functions for transforming the image for training."""
from typing import Optional
import gin
import tensorflow.compat.v2 as tf
CROP_SIZE = 472
@gin.configurable(
denylist=['images', 'crop_size', 'training', 'convert_dtype', 'seed'])
def convert_dtype_and_crop_images(images,
crop_size: int = CROP_SIZE,
training: bool = True,
pad_then_crop: bool = False,
convert_dtype: bool = True,
seed: Optional[tf.Tensor] = None):
"""Convert uint8 [512, 640, 3] images to float32 and square crop.
Args:
images: [B, H, W, 3] uint8 tensor of images.
crop_size: Width of the square crop.
training: If we are in training (random crop) or not-training (fixed crop).
    pad_then_crop: If True, pads the image and then crops back to the original
      image size, so the full field of view is retained.
    convert_dtype: Whether or not to convert the image to float32 in the range
      [0, 1].
    seed: Optional seed of shape (2,) passed to tf.random.stateless_uniform.
Returns:
[B, crop_size, crop_size, 3] images of dtype float32.
"""
if seed is None:
seed = tf.random.uniform(shape=(2,), maxval=2**30, dtype=tf.int32)
seed2 = tf.random.experimental.stateless_split(seed, num=1)[0]
if convert_dtype:
images = tf.image.convert_image_dtype(images, tf.float32)
image_height = images.get_shape().as_list()[-3]
image_width = images.get_shape().as_list()[-2]
if pad_then_crop:
if training:
if image_height == 512:
ud_pad = 40
lr_pad = 100
elif image_height == 256:
ud_pad = 20
lr_pad = 50
else:
raise ValueError(
'convert_dtype_and_crop_images only supports image height 512 or '
'256.')
max_y = 2 * ud_pad
max_x = 2 * lr_pad
images = tf.image.pad_to_bounding_box(
images,
offset_height=ud_pad,
offset_width=lr_pad,
target_height=image_height + 2 * ud_pad,
target_width=image_width + 2 * lr_pad)
offset_y = tf.random.stateless_uniform((),
maxval=max_y + 1,
dtype=tf.int32,
seed=seed)
offset_x = tf.random.stateless_uniform((),
maxval=max_x + 1,
dtype=tf.int32,
seed=seed2)
images = tf.image.crop_to_bounding_box(images, offset_y, offset_x,
image_height, image_width)
else:
# Standard cropping.
max_y = image_height - crop_size
max_x = image_width - crop_size
if training:
offset_y = tf.random.stateless_uniform((),
maxval=max_y + 1,
dtype=tf.int32,
seed=seed)
offset_x = tf.random.stateless_uniform((),
maxval=max_x + 1,
dtype=tf.int32,
seed=seed2)
images = tf.image.crop_to_bounding_box(images, offset_y, offset_x,
crop_size, crop_size)
else:
images = tf.image.crop_to_bounding_box(images, max_y // 2, max_x // 2,
crop_size, crop_size)
return images
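# Illustrative sketch (not part of the original file): cropping a batch of
# uint8 camera images to the default 472x472 training crop. Passing a fixed
# (2,)-shaped seed makes the random crop reproducible across calls.
def _example_crop_batch():
  images = tf.cast(
      tf.random.uniform((2, 512, 640, 3), maxval=256, dtype=tf.int32), tf.uint8)
  seed = tf.constant([1, 2], dtype=tf.int32)
  return convert_dtype_and_crop_images(images, training=True, seed=seed)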

View File

@ -0,0 +1,83 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for preprocessors."""
from absl.testing import parameterized
import numpy as np
from robotics_transformer.film_efficientnet import preprocessors
from tensor2robot.utils import tensorspec_utils
import tensorflow.compat.v2 as tf
def _random_image(shape):
images = tf.random.uniform(
shape, minval=0, maxval=255, dtype=tf.dtypes.int32, seed=42)
return tf.cast(images, tf.uint8)
def _get_features(
image_shape=(2, 512, 640, 3), use_task_image=False, use_goal_image=False):
# Time-dimension stacking occurs during training but not eval.
state = tensorspec_utils.TensorSpecStruct(image=_random_image(image_shape))
if use_task_image:
state.task_image = _random_image(image_shape)
if use_goal_image:
state.goal_image = _random_image(image_shape)
return state
class PreprocessorsTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters((True, False, False), (False, True, False),
(True, False, True), (False, True, True))
def testConvertDtypeAndCropImages(self, training, pad_then_crop,
convert_dtype):
features = _get_features()
images = preprocessors.convert_dtype_and_crop_images(
features.image,
training=training,
pad_then_crop=pad_then_crop,
convert_dtype=convert_dtype)
expected_cropped_shape = ([2, 512, 640, 3]
if pad_then_crop else [2, 472, 472, 3])
tf.ensure_shape(images, expected_cropped_shape)
if convert_dtype:
self.assertEqual(images.dtype, tf.float32)
self.assertLessEqual(images.numpy().max(), 1.)
self.assertGreaterEqual(images.numpy().min(), 0.)
else:
self.assertEqual(images.dtype, tf.uint8)
self.assertLessEqual(images.numpy().max(), 255)
self.assertGreaterEqual(images.numpy().min(), 0)
self.assertGreater(images.numpy().max(), 1)
def testConvertDtypeAndCropImagesSeeded(self):
features = _get_features()
seed = tf.constant([1, 2], tf.int32)
images1 = preprocessors.convert_dtype_and_crop_images(
features.image, training=True, pad_then_crop=True, seed=seed)
images2 = preprocessors.convert_dtype_and_crop_images(
features.image, training=True, pad_then_crop=True, seed=seed)
diff = np.sum(np.abs(images1.numpy() - images2.numpy()))
self.assertAlmostEqual(diff, 0)
def testConvertDtypeAndCropImagesUnseeded(self):
features = _get_features()
seed1 = tf.constant([1, 2], tf.int32)
images1 = preprocessors.convert_dtype_and_crop_images(
features.image, training=True, pad_then_crop=True, seed=seed1)
seed2 = tf.constant([2, 3], tf.int32)
images2 = preprocessors.convert_dtype_and_crop_images(
features.image, training=True, pad_then_crop=True, seed=seed2)
diff = np.sum(np.abs(images1.numpy() - images2.numpy()))
self.assertNotAlmostEqual(diff, 0)

View File

@ -0,0 +1,122 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Encoder based on Efficientnet."""
from typing import Optional
import gin
from robotics_transformer.film_efficientnet import film_conditioning_layer
from robotics_transformer.film_efficientnet import film_efficientnet_encoder
import tensorflow as tf
_MODELS = {
'b3': film_efficientnet_encoder.EfficientNetB3,
}
_SIZES = {
'b3': 300,
}
@gin.configurable
class EfficientNetEncoder(tf.keras.layers.Layer):
"""Applies a pretrained Efficientnet based encoder."""
def __init__(self,
model_variant: str = 'b3',
freeze: bool = False,
early_film: bool = True,
weights: Optional[str] = 'imagenet',
include_top: bool = False,
pooling: bool = True,
**kwargs):
"""Initialize the model.
Args:
model_variant: One of 'b0-b7' of the efficient encoders. See
https://arxiv.org/abs/1905.11946 to understand the variants.
freeze: Whether or not to freeze the pretrained weights (seems to not work
well).
early_film: Whether to inject film layers into the efficientnet encoder
(seems to be essential to getting strong performance).
weights: Which pretrained weights to use. Either 'imagenet', a path to the
pretrained weights, or None for from scratch.
include_top: Whether to add the top fully connected layer. If True, this
will cause encoding to fail and is used only for unit testing purposes.
      pooling: If False, returns the feature map before global average pooling.
**kwargs: Keras specific layer kwargs.
"""
super(EfficientNetEncoder, self).__init__(**kwargs)
if model_variant not in _MODELS:
raise ValueError(f'Unknown variant {model_variant}')
self.model_variant = model_variant
self.early_film = early_film
self.freeze = freeze
self.conv1x1 = tf.keras.layers.Conv2D(
filters=512,
kernel_size=(1, 1),
strides=(1, 1),
padding='SAME',
use_bias=False,
kernel_initializer=tf.keras.initializers.VarianceScaling())
self.net = _MODELS[model_variant](
include_top=include_top,
weights=weights,
include_film=early_film,
)
self.film_layer = film_conditioning_layer.FilmConditioning(num_channels=512)
self._pooling = pooling
def _prepare_image(self, image: tf.Tensor) -> tf.Tensor:
"""Resize the input image and check that the range is correct."""
if len(image.shape) != 4 or image.shape[-1] != 3:
raise ValueError('Provided image should have shape (b, h, w, 3).')
size = _SIZES[self.model_variant]
if image.shape[1] < size / 4 or image.shape[2] < size / 4:
raise ValueError('Provided image is too small.')
if image.shape[1] > size * 4 or image.shape[2] > size * 4:
raise ValueError('Provided image is too large.')
image = tf.image.resize(image, (size, size))
c1 = tf.Assert(tf.reduce_max(image) <= 1, data=[tf.reduce_max(image)])
c2 = tf.Assert(tf.reduce_min(image) >= 0, data=[tf.reduce_min(image)])
with tf.control_dependencies([c1, c2]):
image *= 255 # The image is expected to be in range(0, 255).
image = film_efficientnet_encoder.preprocess_input(image)
return image
def _encode(self, image: tf.Tensor, context: tf.Tensor,
training: bool) -> tf.Tensor:
"""Run the image through the efficientnet encoder."""
image = self._prepare_image(image)
if self.early_film:
return self.net((image, context), training=training)
return self.net(image, training=training)
def call(self,
image: tf.Tensor,
context: Optional[tf.Tensor] = None,
training: bool = True) -> tf.Tensor:
if self.freeze:
features = tf.stop_gradient(self._encode(image, context, training))
else:
features = self._encode(image, context, training)
if context is not None:
features = self.conv1x1(features)
features = self.film_layer(features, context)
if not self._pooling:
return features
# Global average pool.
return tf.reduce_mean(features, [1, 2])
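# Illustrative sketch (not part of the original file): encoding a batch of
# images with a 512-d context vector. weights=None is an assumption to keep the
# sketch self-contained; images must be float32 in [0, 1] as checked by
# _prepare_image above, and the pooled output has shape (batch, 512).
def _example_encode():
  encoder = EfficientNetEncoder(weights=None)
  image = tf.random.uniform((2, 300, 300, 3))
  context = tf.random.uniform((2, 512))
  return encoder(image, context, training=False)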

View File

@ -0,0 +1,49 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for pretrained_efficientnet_encoder."""
import numpy as np
from robotics_transformer.film_efficientnet import film_efficientnet_encoder
from robotics_transformer.film_efficientnet import pretrained_efficientnet_encoder as eff
from skimage import data
import tensorflow as tf
class PretrainedEfficientnetEncoderTest(tf.test.TestCase):
def test_encoding(self):
"""Test that we get a correctly shaped decoding."""
state = np.random.RandomState(0)
context = state.uniform(-1, 1, (10, 512))
model = eff.EfficientNetEncoder()
image = np.expand_dims(data.chelsea(), axis=0) / 255
preds = model(image, context, training=False).numpy()
self.assertEqual(preds.shape, (10, 512))
def test_imagenet_classification(self):
"""Test that we can correctly classify an image of a cat."""
state = np.random.RandomState(0)
context = state.uniform(-1, 1, (10, 512))
model = eff.EfficientNetEncoder(include_top=True)
image = np.expand_dims(data.chelsea(), axis=0) / 255
preds = model._encode(image, context, training=False).numpy()
predicted_names = [
n[1]
for n in film_efficientnet_encoder.decode_predictions(preds, top=3)[0]
]
self.assertIn('tabby', predicted_names)
if __name__ == '__main__':
tf.test.main()

44
pyproject.toml Normal file
View File

@ -0,0 +1,44 @@
[project]
name = "robotics_transformer"
description = ""
readme = "README.md"
requires-python = ">=3.7"
license = {file = "LICENSE"}
authors = [{name = "robotics_transformer authors", email="robotics_transformer@google.com"}]
classifiers = [
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"License :: OSI Approved :: Apache Software License",
"Intended Audience :: Science/Research",
]
keywords = []
# pip dependencies of the project
dependencies = []
# This is set automatically by flit using `robotics_transformer.__version__`
dynamic = ["version"]
[project.urls]
homepage = "https://github.com/google-research/robotics_transformer"
repository = "https://github.com/google-research/robotics_transformer"
# Other: `documentation`, `changelog`
[project.optional-dependencies]
# Development deps (unittest, linting, formating,...)
# Installed through `pip install .[dev]`
dev = [
"pytest",
"pytest-xdist",
"pylint>=2.6.0",
"pyink",
]
[tool.pyink]
# Formatting configuration to follow Google style-guide
pyink-indentation = 2
pyink-use-majority-quotes = true
[build-system]
requires = ["flit_core >=3.5,<4"]
build-backend = "flit_core.buildapi"

9
requirements.txt Normal file
View File

@ -0,0 +1,9 @@
absl-py>=0.5.0
numpy>=1.13.3
tensorflow>=1.13.0
tensorflow-serving-api>=1.13.0
gin-config>=0.1.4
tensorflow-probability>=0.6.0
tf-agents>=0.3.0
tf-slim>=1.0
git+https://github.com/google-research/tensor2robot#tensor2robot

View File

@ -0,0 +1,19 @@
include "devtools/blueprint/bluze/public/bluze.ncl";
include bytes "third_party/py/robotics_transformer/bluze.textproto" as textproto;
// See go/bluze/guide before editing. To check the generated final blueprint run
// rncl third_party/py/robotics_transformer/robotics_transformer.blueprint printproto blueprint_file
blueprint_file = ::bluze::BlueprintFile(
textproto,
project_name = "robotics_transformer",
teams_product_id = 9019942154,
tech_lead = ["keerthanapg"],
dev_mailing_list = "robotics_transformer-automated@google.com",
mdb_groups = ["robotics"],
buganizer_component_ids = [1150225],
metadata_path = "//depot/google3/third_party/py/robotics_transformer/METADATA",
// Customize your blueprint here: go/blueprint/howto-write.
);

173
sequence_agent.py Normal file
View File

@ -0,0 +1,173 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Sequence policy and agent that directly output actions via actor network.
These classes are not intended to change as they are generic enough for any
all-neural actor based agent+policy. All new features are intended to be
implemented in `actor_network` and `loss_fn`.
# TODO(b/231896343): Update litred docs on how to use these.
"""
from typing import Optional, Type
from absl import logging
import tensorflow as tf
from tf_agents.agents import data_converter
from tf_agents.agents import tf_agent
from tf_agents.networks import network
from tf_agents.policies import actor_policy
from tf_agents.trajectories import policy_step
from tf_agents.trajectories import time_step as ts
from tf_agents.typing import types
from tf_agents.utils import nest_utils
class SequencePolicy(actor_policy.ActorPolicy):
"""A policy that directly outputs actions via an actor network."""
def __init__(self, **kwargs):
self._actions = None
super().__init__(**kwargs)
def set_actions(self, actions):
self._actor_network.set_actions(actions)
def get_actor_loss(self):
return self._actor_network.get_actor_loss()
def get_aux_info(self):
return self._actor_network.get_aux_info()
def set_training(self, training):
self._training = training
def _action(self,
time_step: ts.TimeStep,
policy_state: types.NestedTensor,
seed: Optional[types.Seed] = None) -> policy_step.PolicyStep:
del seed
action, policy_state = self._apply_actor_network(
time_step.observation,
step_type=time_step.step_type,
policy_state=policy_state)
info = ()
return policy_step.PolicyStep(action, policy_state, info)
def _distribution(self, time_step, policy_state):
current_step = super()._distribution(time_step, policy_state)
return current_step
class SequenceAgent(tf_agent.TFAgent):
"""A sequence agent that directly outputs actions via an actor network."""
def __init__(self,
time_step_spec: ts.TimeStep,
action_spec: types.NestedTensorSpec,
actor_network: Type[network.Network],
actor_optimizer: tf.keras.optimizers.Optimizer,
policy_cls: Type[actor_policy.ActorPolicy] = SequencePolicy,
time_sequence_length: int = 6,
debug_summaries: bool = False,
**kwargs):
self._info_spec = ()
self._actor_network = actor_network( # pytype: disable=missing-parameter # dynamic-method-lookup
input_tensor_spec=time_step_spec.observation,
output_tensor_spec=action_spec,
policy_info_spec=self._info_spec,
train_step_counter=kwargs['train_step_counter'],
time_sequence_length=time_sequence_length)
self._actor_optimizer = actor_optimizer
# Train policy is only used for loss and never exported as saved_model.
self._train_policy = policy_cls(
time_step_spec=time_step_spec,
action_spec=action_spec,
info_spec=self._info_spec,
actor_network=self._actor_network,
training=True)
collect_policy = policy_cls(
time_step_spec=time_step_spec,
action_spec=action_spec,
info_spec=self._info_spec,
actor_network=self._actor_network,
training=False)
super(SequenceAgent, self).__init__(
time_step_spec,
action_spec,
collect_policy, # We use the collect_policy as the eval policy.
collect_policy,
train_sequence_length=time_sequence_length,
**kwargs)
self._data_context = data_converter.DataContext(
time_step_spec=time_step_spec,
action_spec=action_spec,
info_spec=collect_policy.info_spec,
use_half_transition=True)
self.as_transition = data_converter.AsHalfTransition(
self._data_context, squeeze_time_dim=False)
self._debug_summaries = debug_summaries
num_params = 0
for weight in self._actor_network.trainable_weights:
weight_params = 1
for dim in weight.shape:
weight_params *= dim
logging.info('%s has %s params.', weight.name, weight_params)
num_params += weight_params
logging.info('Actor network has %sM params.', round(num_params / 1000000.,
2))
def _train(self, experience: types.NestedTensor,
weights: types.Tensor) -> tf_agent.LossInfo:
self.train_step_counter.assign_add(1)
loss_info = self._loss(experience, weights, training=True)
self._apply_gradients(loss_info.loss)
return loss_info
def _apply_gradients(self, loss: types.Tensor):
variables = self._actor_network.trainable_weights
gradients = tf.gradients(loss, variables)
# Skip nan and inf gradients.
new_gradients = []
for g in gradients:
if g is not None:
new_g = tf.where(
tf.math.logical_or(tf.math.is_inf(g), tf.math.is_nan(g)),
tf.zeros_like(g), g)
new_gradients.append(new_g)
else:
new_gradients.append(g)
grads_and_vars = list(zip(new_gradients, variables))
self._actor_optimizer.apply_gradients(grads_and_vars)
def _loss(self, experience: types.NestedTensor, weights: types.Tensor,
training: bool) -> tf_agent.LossInfo:
transition = self.as_transition(experience)
time_steps, policy_steps, _ = transition
batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
policy = self._train_policy
policy.set_actions(policy_steps.action)
policy.set_training(training=training)
with tf.name_scope('actor_loss'):
policy_state = policy.get_initial_state(batch_size)
policy.action(time_steps, policy_state=policy_state)
valid_mask = tf.cast(~time_steps.is_last(), tf.float32)
loss = valid_mask * policy.get_actor_loss()
loss = tf.reduce_mean(loss)
policy.set_actions(None)
self._actor_network.add_summaries(time_steps.observation,
policy.get_aux_info(),
self._debug_summaries, training)
return tf_agent.LossInfo(loss=loss, extra=loss)
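# Illustrative sketch (not part of the original file): the same NaN/Inf masking
# used in _apply_gradients above, shown on a standalone gradient tensor. Bad
# entries are zeroed rather than dropped, so the optimizer update stays aligned
# with its variables.
def _example_mask_bad_gradients():
  grad = tf.constant([1.0, float('nan'), float('inf'), -2.0])
  return tf.where(
      tf.math.logical_or(tf.math.is_inf(grad), tf.math.is_nan(grad)),
      tf.zeros_like(grad), grad)  # -> [1.0, 0.0, 0.0, -2.0]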

28
sequence_agent_test.py Normal file
View File

@ -0,0 +1,28 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for sequence_agent."""
from robotics_transformer.sequence_agent_test_set_up import SequenceAgentTestSetUp
import tensorflow as tf
from tf_agents.agents import data_converter
class SequenceAgentTest(SequenceAgentTestSetUp):
def testAsTransitionType(self):
agent = self.create_agent_and_initialize()
self.assertIsInstance(agent.as_transition, data_converter.AsHalfTransition)
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,144 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for sequence_agent."""
from typing import Type
import numpy as np
from robotics_transformer import sequence_agent
from tensor2robot.utils import tensorspec_utils
import tensorflow as tf
from tf_agents.networks import network
from tf_agents.policies import policy_saver
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts
class DummyActorNet(network.Network):
"""Used for testing SequenceAgent and its subclass."""
def __init__(self,
output_tensor_spec=None,
train_step_counter=None,
policy_info_spec=None,
time_sequence_length=1,
use_tcl=False,
**kwargs):
super().__init__(**kwargs)
@property
def tokens_per_action(self):
return 8
def set_actions(self, actions):
self._actions = actions
def get_actor_loss(self):
return self._actor_loss
def call(self,
observations,
step_type,
network_state,
actions=None,
training=False):
del step_type
image = observations['image']
tf.expand_dims(tf.reduce_mean(image, axis=-1), -1)
actions = tensorspec_utils.TensorSpecStruct(
world_vector=tf.constant(1., shape=[1, 3]),
rotation_delta=tf.constant(1., shape=[1, 3]),
terminate_episode=tf.constant(1, shape=[1, 2]),
gripper_closedness_action=tf.constant(1., shape=[1, 1]),
)
return actions, network_state
@property
def trainable_weights(self):
return [tf.Variable(1.0)]
class SequenceAgentTestSetUp(tf.test.TestCase):
"""Defines spec for testing SequenceAgent and its subclass, tests create."""
def setUp(self):
super().setUp()
self._action_spec = tensorspec_utils.TensorSpecStruct()
self._action_spec.world_vector = tensor_spec.BoundedTensorSpec(
(3,), dtype=tf.float32, minimum=-1., maximum=1., name='world_vector')
self._action_spec.rotation_delta = tensor_spec.BoundedTensorSpec(
(3,),
dtype=tf.float32,
minimum=-np.pi / 2,
maximum=np.pi / 2,
name='rotation_delta')
self._action_spec.gripper_closedness_action = tensor_spec.BoundedTensorSpec(
(1,),
dtype=tf.float32,
minimum=-1.,
maximum=1.,
name='gripper_closedness_action')
self._action_spec.terminate_episode = tensor_spec.BoundedTensorSpec(
(2,), dtype=tf.int32, minimum=0, maximum=1, name='terminate_episode')
state_spec = tensorspec_utils.TensorSpecStruct()
state_spec.image = tensor_spec.BoundedTensorSpec([256, 320, 3],
dtype=tf.float32,
name='image',
minimum=0.,
maximum=1.)
state_spec.natural_language_embedding = tensor_spec.TensorSpec(
shape=[512], dtype=tf.float32, name='natural_language_embedding')
self._time_step_spec = ts.time_step_spec(observation_spec=state_spec)
self.sequence_agent_cls = sequence_agent.SequenceAgent
def create_agent_and_initialize(self,
actor_network: Type[
network.Network] = DummyActorNet,
**kwargs):
"""Creates the agent and initialize it."""
agent = self.sequence_agent_cls(
time_step_spec=self._time_step_spec,
action_spec=self._action_spec,
actor_network=actor_network,
actor_optimizer=tf.keras.optimizers.Adam(),
train_step_counter=tf.compat.v1.train.get_or_create_global_step(),
**kwargs)
agent.initialize()
return agent
def testCreateAgent(self):
"""Creates the Agent and save the agent.policy."""
agent = self.create_agent_and_initialize()
self.assertIsNotNone(agent.policy)
policy_model_saver = policy_saver.PolicySaver(
agent.policy,
train_step=tf.compat.v2.Variable(
0,
trainable=False,
dtype=tf.int64,
aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
shape=()),
input_fn_and_spec=None)
save_options = tf.saved_model.SaveOptions(
experimental_io_device='/job:localhost',
experimental_custom_gradients=False)
policy_model_saver.save('/tmp/unittest/policy/0', options=save_options)
if __name__ == '__main__':
tf.test.main()

13
tokenizers/__init__.py Normal file
View File

@ -0,0 +1,13 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -0,0 +1,157 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A simple action tokenizer used with Robotics Transformer 1.
As an example, if an action is:
terminate = [0, 1]
world_vector = [0.9, 0.8, -0.3]
rotation_delta = [-0.1, 0.2, .6]
gripper_closedness = 0.9
Then we build a sequence of tokens of length 8 [one for each dimension].
The int32-typed action dimensions are assumed to be discrete and already
tokenized; the float dimensions are bucketed according to the spec's min and
max. Each dimension has 'vocab_size' buckets.
Currently, this tokenizer assumes a single action spec, and it is highly
recommended to specify the 'action_order', e.g. [terminate, world_vector,
rotation_delta, gripper_closedness]. That ordering is lost after tokenization,
so recording it here is useful for debugging. Actions may also be subselected
for prediction, since not all actions in the spec need to appear in the
action_order.
"""
from typing import Optional
from tensor2robot.utils import tensorspec_utils
import tensorflow as tf
class RT1ActionTokenizer:
"""Tokenizes based on vocab size."""
def __init__(self,
action_spec: tensorspec_utils.TensorSpecStruct,
vocab_size: int,
action_order: Optional[list[str]] = None):
"""Instantiates an RT1ActionTokenizer.
Args:
action_spec: Tensor spec of the expected action tensor.
vocab_size: Number of buckets to discretize action to.
      action_order: Order of the action names, used to determine the order in
        which tokenized actions are detokenized and assembled back into the
        action tensor.
"""
self._action_spec = action_spec
self._vocab_size = vocab_size
if action_order is None:
self._action_order = self._action_spec.keys()
else:
for action in action_order:
if action not in self._action_spec.keys():
raise ValueError('actions: %s not found in action_spec: %s' %
(action, action_spec.keys()))
assert action in self._action_spec.keys()
self._action_order = action_order
self._tokens_per_action = 0
for action in self._action_order:
action_shape = self._action_spec[action].shape
if len(action_shape) != 1:
raise ValueError(
'Only action shapes with single dimension supported, got %s' %
action_shape)
if self._action_spec[action].dtype == tf.int32:
# Int32 actions are already assumed to be tokens.
self._tokens_per_action += 1
else:
self._tokens_per_action += action_shape[0]
    # We count the number of action tokens in two different ways: once from
    # action_order (above) and once by looping through the action spec (below).
    # We assert that the two counts match, which ensures action_order is
    # configured correctly; otherwise the assert below raises an error.
num_action_token = 0
for spec in self._action_spec.values():
if spec.dtype == tf.int32:
num_action_token += 1
else:
num_action_token += spec.shape[-1]
tf.debugging.assert_equal(num_action_token, self._tokens_per_action)
@property
def tokens_per_action(self) -> int:
return self._tokens_per_action
@property
def action_spec(self) -> tensorspec_utils.TensorSpecStruct:
return self._action_spec
@property
def action_order(self) -> list[str]:
return self._action_order
def tokenize(self, action: tensorspec_utils.TensorSpecStruct) -> tf.Tensor:
"""Tokenizes an action."""
action_tokens = []
for k in self._action_order:
a = action[k] # a is [batch, actions_size]
spec = self._action_spec[k]
if spec.dtype == tf.int32:
        # Int32 actions are assumed to be one-hot tokens; convert each to a
        # single token index and check that it is within the vocab size.
tf.debugging.assert_equal(1, tf.reduce_sum(a, axis=-1))
# extract the token [batch, 1]
token = tf.argmax(a, axis=-1, output_type=tf.int32)
tf.debugging.assert_less(token, self._vocab_size)
# Add a seq dimension [batch, 1]
token = tf.expand_dims(token, axis=-1)
else:
a = tf.clip_by_value(a, spec.minimum, spec.maximum)
# Normalize the action [batch, actions_size]
token = (a - spec.minimum) / (spec.maximum - spec.minimum)
# Bucket and discretize the action to vocab_size, [batch, actions_size]
token = tf.cast(token * (self._vocab_size - 1), tf.int32)
action_tokens.append(token)
# Append all actions, [batch, all_actions_size]
action_tokens = tf.concat(action_tokens, axis=-1)
return action_tokens
def detokenize(self,
action_tokens: tf.Tensor) -> tensorspec_utils.TensorSpecStruct:
"""Detokenizes an action."""
action = tensorspec_utils.TensorSpecStruct()
token_index = 0
for k in self._action_order:
spec = self._action_spec[k]
action_dim = spec.shape[0]
if spec.dtype == tf.int32:
# Int32 actions are already assumed to be tokens.
action[k] = action_tokens[..., token_index]
        # A poor model may output tokens outside the allowed range; in that
        # case, set them to a default value (the 0 token).
outside_range = tf.greater_equal(action[k], action_dim)
action[k] = tf.where(outside_range, tf.zeros_like(action[k]), action[k])
action[k] = tf.one_hot(
action[k], depth=action_dim, axis=-1, dtype=tf.int32)
token_index += 1
else:
actions = []
for _ in range(action_dim):
a = action_tokens[..., token_index:token_index + 1]
a = tf.cast(a, tf.float32)
a = a / (self._vocab_size - 1)
a = (a * (spec.maximum - spec.minimum)) + spec.minimum
actions.append(a)
token_index += 1
action[k] = tf.concat(actions, axis=-1)
return action
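# Illustrative sketch (not part of the original file): a tokenize/detokenize
# round trip on a single continuous action. With vocab_size=10 and a [-1, 1]
# spec, the value 0.1 normalizes to 0.55 and lands in bucket
# int(0.55 * 9) = 4, matching the unit tests.
def _example_round_trip():
  from tf_agents.specs import tensor_spec  # Assumption: available, as in tests.
  spec = tensorspec_utils.TensorSpecStruct()
  spec.world_vector = tensor_spec.BoundedTensorSpec(
      (3,), dtype=tf.float32, minimum=-1., maximum=1., name='world_vector')
  tokenizer = RT1ActionTokenizer(spec, vocab_size=10)
  action = tensorspec_utils.TensorSpecStruct(world_vector=[0.1, 0.5, -0.8])
  tokens = tokenizer.tokenize(action)  # -> [4, 6, 0]
  return tokenizer.detokenize(tokens)  # Approximately recovers the action.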

View File

@ -0,0 +1,191 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for action_tokenizer."""
import numpy as np
from robotics_transformer.tokenizers import action_tokenizer
from tensor2robot.utils import tensorspec_utils
import tensorflow as tf
from tf_agents.specs import tensor_spec
class ActionTokenizerTest(tf.test.TestCase):
def testTokenize_int32(self):
action_spec = tensorspec_utils.TensorSpecStruct()
action_spec.terminate_episode = tensor_spec.BoundedTensorSpec(
(2,), dtype=tf.int32, minimum=0, maximum=1, name='terminate_episode')
tokenizer = action_tokenizer.RT1ActionTokenizer(action_spec, vocab_size=10)
self.assertEqual(1, tokenizer.tokens_per_action)
action = tensorspec_utils.TensorSpecStruct(terminate_episode=[0, 1])
action_tokens = tokenizer.tokenize(action)
self.assertEqual([1], action_tokens.numpy())
def testTokenize_int32_not_one_hot(self):
action_spec = tensorspec_utils.TensorSpecStruct()
action_spec.terminate_episode = tensor_spec.BoundedTensorSpec(
(2,), dtype=tf.int32, minimum=0, maximum=1, name='terminate_episode')
tokenizer = action_tokenizer.RT1ActionTokenizer(action_spec, vocab_size=10)
self.assertEqual(1, tokenizer.tokens_per_action)
action = tensorspec_utils.TensorSpecStruct(terminate_episode=[1, 8])
with self.assertRaises(tf.errors.InvalidArgumentError):
tokenizer.tokenize(action)
def testDetokenize_int32(self):
action_spec = tensorspec_utils.TensorSpecStruct()
action_spec.terminate_episode = tensor_spec.BoundedTensorSpec(
(2,), dtype=tf.int32, minimum=0, maximum=1, name='terminate_episode')
tokenizer = action_tokenizer.RT1ActionTokenizer(action_spec, vocab_size=10)
# 0 token should become a one hot: [1, 0]
action = tokenizer.detokenize(tf.constant([0], dtype=tf.int32))
self.assertSequenceEqual([1, 0], list(action['terminate_episode'].numpy()))
# 1 token should become a one hot: [0, 1]
action = tokenizer.detokenize(tf.constant([1], dtype=tf.int32))
self.assertSequenceEqual([0, 1], list(action['terminate_episode'].numpy()))
# OOV 3 token should become a default one hot: [1, 0]
action = tokenizer.detokenize(tf.constant([3], dtype=tf.int32))
self.assertSequenceEqual([1, 0], list(action['terminate_episode'].numpy()))
def testTokenize_float(self):
action_spec = tensorspec_utils.TensorSpecStruct()
action_spec.world_vector = tensor_spec.BoundedTensorSpec(
(3,), dtype=tf.float32, minimum=-1., maximum=1., name='world_vector')
tokenizer = action_tokenizer.RT1ActionTokenizer(action_spec, vocab_size=10)
self.assertEqual(3, tokenizer.tokens_per_action)
action = tensorspec_utils.TensorSpecStruct(world_vector=[0.1, 0.5, -0.8])
action_tokens = tokenizer.tokenize(action)
self.assertSequenceEqual([4, 6, 0], list(action_tokens.numpy()))
def testTokenize_float_with_time_dimension(self):
action_spec = tensorspec_utils.TensorSpecStruct()
action_spec.world_vector = tensor_spec.BoundedTensorSpec(
(3,), dtype=tf.float32, minimum=-1., maximum=1., name='world_vector')
tokenizer = action_tokenizer.RT1ActionTokenizer(action_spec, vocab_size=10)
self.assertEqual(3, tokenizer.tokens_per_action)
batch_size = 2
time_dimension = 3
action = tensorspec_utils.TensorSpecStruct(
world_vector=tf.constant(
[[0.1, 0.5, -0.8], [0.1, 0.5, -0.8], [0.1, 0.5, -0.8],
[0.1, 0.5, -0.8], [0.1, 0.5, -0.8], [0.1, 0.5, -0.8]],
shape=[batch_size, time_dimension, tokenizer.tokens_per_action]))
action_tokens = tokenizer.tokenize(action)
self.assertSequenceEqual(
[batch_size, time_dimension, tokenizer.tokens_per_action],
action_tokens.shape.as_list())
def testTokenize_float_at_limits(self):
minimum = -1.
maximum = 1.
vocab_size = 10
action_spec = tensorspec_utils.TensorSpecStruct()
action_spec.world_vector = tensor_spec.BoundedTensorSpec(
(2,),
dtype=tf.float32,
minimum=minimum,
maximum=maximum,
name='world_vector')
tokenizer = action_tokenizer.RT1ActionTokenizer(
action_spec, vocab_size=vocab_size)
self.assertEqual(2, tokenizer.tokens_per_action)
action = tensorspec_utils.TensorSpecStruct(world_vector=[minimum, maximum])
action_tokens = tokenizer.tokenize(action)
    # The minimum value will go to 0.
    # The maximum value will go to vocab_size - 1.
self.assertSequenceEqual([0, vocab_size - 1], list(action_tokens.numpy()))
def testTokenize_invalid_action_spec_shape(self):
action_spec = tensorspec_utils.TensorSpecStruct()
action_spec.world_vector = tensor_spec.BoundedTensorSpec(
(2, 2), dtype=tf.float32, minimum=1, maximum=-1, name='world_vector')
with self.assertRaises(ValueError):
action_tokenizer.RT1ActionTokenizer(action_spec, vocab_size=10)
def testTokenizeAndDetokenizeIsEqual(self):
action_spec = tensorspec_utils.TensorSpecStruct()
action_spec.world_vector = tensor_spec.BoundedTensorSpec(
(3,), dtype=tf.float32, minimum=-1., maximum=1., name='world_vector')
action_spec.rotation_delta = tensor_spec.BoundedTensorSpec(
(3,),
dtype=tf.float32,
minimum=-np.pi / 2.,
maximum=np.pi / 2.,
name='rotation_delta')
action_spec.gripper_closedness_action = tensor_spec.BoundedTensorSpec(
(1,),
dtype=tf.float32,
minimum=-1.,
maximum=1.,
name='gripper_closedness_action')
num_sub_action_space = 2
action_spec.terminate_episode = tensor_spec.BoundedTensorSpec(
(num_sub_action_space,),
dtype=tf.int32,
minimum=0,
maximum=1,
name='terminate_episode')
tokenizer = action_tokenizer.RT1ActionTokenizer(
action_spec,
vocab_size=1024,
action_order=[
'terminate_episode', 'world_vector', 'rotation_delta',
'gripper_closedness_action'
])
self.assertEqual(8, tokenizer.tokens_per_action)
# Repeat the following test N times with fuzzy inputs.
n_repeat = 10
for _ in range(n_repeat):
action = tensorspec_utils.TensorSpecStruct(
world_vector=np.random.uniform(low=-1., high=1.0, size=3),
rotation_delta=np.random.uniform(
low=-np.pi / 2., high=np.pi / 2., size=3),
gripper_closedness_action=np.random.uniform(low=0., high=1.0, size=1),
terminate_episode=[0, 1])
action_tokens = tokenizer.tokenize(action)
policy_action = tokenizer.detokenize(action_tokens)
for k in action:
self.assertSequenceAlmostEqual(
action[k], policy_action[k].numpy(), places=2)
# Repeat the test with batched actions
batched_action = tensorspec_utils.TensorSpecStruct(
world_vector=[
np.random.uniform(low=-1., high=1.0, size=3),
np.random.uniform(low=-1., high=1.0, size=3)
],
rotation_delta=[
np.random.uniform(low=-np.pi / 2., high=np.pi / 2., size=3),
np.random.uniform(low=-np.pi / 2., high=np.pi / 2., size=3)
],
gripper_closedness_action=[
np.random.uniform(low=0., high=1.0, size=1),
np.random.uniform(low=0., high=1.0, size=1)
],
terminate_episode=[[0, 1], [1, 0]])
action_tokens = tokenizer.tokenize(batched_action)
policy_action = tokenizer.detokenize(action_tokens)
for k in batched_action:
for a, policy_a in zip(batched_action[k], policy_action[k].numpy()):
self.assertSequenceAlmostEqual(a, policy_a, places=2)
if __name__ == '__main__':
tf.test.main()

View File

@ -0,0 +1,112 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A FiLM Efficientnet contextual image tokenizer used in Robotics Transformer 1.
"""
from typing import Optional
from robotics_transformer.film_efficientnet import pretrained_efficientnet_encoder
from robotics_transformer.tokenizers import token_learner
import tensorflow as tf
class RT1ImageTokenizer(tf.keras.layers.Layer):
"""Tokenizes based on vocab size."""
def __init__(self,
embedding_output_dim: int,
use_token_learner: bool = False,
num_tokens: int = 8,
**kwargs):
"""Instantiates a RT1ImageTokenizer.
Args:
embedding_output_dim: The output size of the tokens.
use_token_learner: Whether to use token learner. See
https://arxiv.org/abs/2106.11297
num_tokens: Relevant only for token learner - the number of learned
tokens.
**kwargs: Keyword arguments to base class.
"""
super().__init__(**kwargs)
self._embedding_output_dim = embedding_output_dim
self._tokenizer = pretrained_efficientnet_encoder.EfficientNetEncoder(
pooling=False, early_film=True)
self._use_token_learner = use_token_learner
if self._use_token_learner:
self._num_tokens = num_tokens
self._token_learner = token_learner.TokenLearnerModule(
num_tokens=self._num_tokens)
@property
def tokens_per_context_image(self) -> int:
if self._use_token_learner:
num_tokens = self._num_tokens
else:
num_tokens = 81
return num_tokens
def __call__(self,
image: tf.Tensor,
context: Optional[tf.Tensor] = None,
training: bool = False) -> tf.Tensor:
"""Gets image tokens.
Args:
image: Images of shape (b, t, h, w, 3) to tokenize.
context: An optional context vector (e.g., a natural language embedding).
Expected to have shape (b, t, embedding_dim).
training: Whether or not we are in training mode.
Returns:
tokens: has shape (batch, t, num_tokens_per_timestep, embedding_dim)
"""
image_shape = tf.shape(image)
b = image_shape[0]
t = image_shape[1]
h = image_shape[2]
w = image_shape[3]
c = image_shape[4]
# Fold the time axis into the batch axis.
image = tf.reshape(image, [b * t, h, w, c])
if context is not None:
context_rank = tf.rank(context)
assertion = tf.Assert(context_rank == 3, data=[context_rank])
with tf.control_dependencies([assertion]):
context = tf.reshape(context, [b * t, tf.shape(context)[-1]])
tokens = self.get_image_embeddings(image, context, training)
if self._use_token_learner:
tokens = self._token_learner(tokens, training)
# Unflatten the time axis, which was previously flattened into the batch.
tokens = tf.reshape(tokens, [b, t, tf.shape(tokens)[1], -1])
return tokens
def get_image_embeddings(self,
image: tf.Tensor,
context: Optional[tf.Tensor],
training: bool = False) -> tf.Tensor:
"""Gets embeddings from image.
Args:
image: Expected to be float32 in range [0, 1] with shape (b, h, w, 3).
context: Expected to be float32 with shape (b, embedding_dim)
training: Whether or not we are in training mode.
Returns:
      tokens of shape (b, num_tokens, embedding_dim).
"""
image_tokens = self._tokenizer(image, context=context, training=training)
image_tokens = tf.reshape(image_tokens, [-1, 81, 512])
return image_tokens
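# Illustrative sketch (not part of the original file): tokenizing a two-frame
# clip with a per-frame 512-d context vector, mirroring the shapes used in the
# unit test. With the token learner disabled this yields 81 tokens per frame
# (see tokens_per_context_image above).
def _example_tokenize_clip():
  tokenizer = RT1ImageTokenizer(embedding_output_dim=512)
  images = tf.random.uniform((1, 2, 224, 224, 3))
  context = tf.random.uniform((1, 2, 512))
  return tokenizer(images, context, training=False)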

View File

@ -0,0 +1,46 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for image_tokenizer."""
from absl.testing import parameterized
from robotics_transformer.tokenizers import image_tokenizer
import tensorflow as tf
class ImageTokenizerTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.named_parameters(
('sample_image', 512, 224, False, 8),
('sample_image_token_learner', 512, 224, True, 8))
def testTokenize(self, output_dim, image_resolution, use_token_learner,
num_tokens):
batch = 1
seq = 2
tokenizer = image_tokenizer.RT1ImageTokenizer(
embedding_output_dim=output_dim,
use_token_learner=use_token_learner,
num_tokens=num_tokens)
image = tf.random.normal(
shape=(batch, seq, image_resolution, image_resolution, 3))
image = tf.clip_by_value(image, 0.0, 1.0)
context_vector = tf.random.uniform((batch, seq, 512))
image_tokens = tokenizer(image, context_vector)
if use_token_learner:
self.assertEqual(image_tokens.shape, [batch, seq, num_tokens, 512])
else:
self.assertEqual(image_tokens.shape, [batch, seq, 81, 512])
if __name__ == '__main__':
tf.test.main()

128
tokenizers/token_learner.py Normal file
View File

@ -0,0 +1,128 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TF implementation of Token Learner(Ryoo et al 2021)."""
import functools
from typing import Optional, Sequence, Union
import numpy as np
import tensorflow as tf
def gelu(x: tf.Tensor) -> tf.Tensor:
return 0.5 * x * (1 +
tf.tanh(tf.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))
def _maybe_dropout(rate: float = 0.0, name: str = "dropout"):
"""Helper function to return dropout layer if rate is non zero."""
if rate:
return tf.keras.layers.Dropout(rate, name=name)
return lambda x, *args: x # Does nothing to x.
class MlpBlock(tf.keras.layers.Layer):
"""Transformer MLP / feed-forward block."""
def __init__(self,
*,
mlp_dim: int,
out_dim: Optional[int] = None,
kernel_init: Optional[tf.keras.initializers.Initializer] = tf
.keras.initializers.glorot_uniform(),
bias_init: Optional[tf.keras.initializers.Initializer] = tf.keras
.initializers.RandomNormal(stddev=1e-6),
dropout_rate: float = 0.1,
**kwargs):
"""Initializer for the MLP Block.
This computes outer_dense(gelu(hidden_dense(input))), with dropout
applied as necessary.
    Note: Especially outside a keras workflow, make sure to call layer.build().
Args:
mlp_dim: The dimension of the inner representation (output of hidden
layer). Usually larger than the input/output dim.
out_dim: The output dimension of the block. If None, the model output dim
is equal to the input dim (usually desired)
kernel_init: Initializer for dense kernels, used for both dense layers.
bias_init: Initializer for dense biases, used for both dense layers.
dropout_rate: Dropout rate to be applied after dense ( & activation)
**kwargs: Other keyword args passed to the tf.keras.layers.Layer
constructor e.g. the name
"""
super().__init__(**kwargs)
self._out_dim = out_dim
self._hidden_dropout = _maybe_dropout(dropout_rate)
self._output_dropout = _maybe_dropout(dropout_rate)
self._hidden_layer = tf.keras.layers.Dense(
mlp_dim,
activation=gelu,
kernel_initializer=kernel_init,
bias_initializer=bias_init,
name="hidden_dense")
# If out_dim is None, infer out_dim = input_dim at self.build()
self._output_layer = functools.partial(
tf.keras.layers.Dense,
kernel_initializer=kernel_init,
bias_initializer=bias_init,
name="final_dense")
def build(self, input_shape: Sequence[int]):
out_dim = self._out_dim or input_shape[-1]
self._output_layer = self._output_layer(units=out_dim)
super().build(input_shape)
def call(self,
inputs: tf.Tensor,
*,
is_training: Union[bool, tf.Tensor] = False) -> tf.Tensor:
"""Applies Transformer MlpBlock module."""
x = self._hidden_layer(inputs)
x = self._hidden_dropout(x, is_training)
x = self._output_layer(x)
x = self._output_dropout(x, is_training)
return x
class TokenLearnerModule(tf.keras.layers.Layer):
"""TokenLearner module V1.1 (https://arxiv.org/abs/2106.11297)."""
def __init__(self,
num_tokens: int,
bottleneck_dim: int = 64,
dropout_rate: float = 0.):
super().__init__()
self.mlp = MlpBlock(
mlp_dim=bottleneck_dim, out_dim=num_tokens, dropout_rate=dropout_rate)
self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
def call(self, inputs: tf.Tensor, training: bool = False) -> tf.Tensor:
if len(inputs.shape) == 4:
bs, h, w, c = inputs.shape
inputs = tf.reshape(inputs, [bs, h * w, c])
selected = self.layernorm(inputs)
selected = self.mlp(
selected, is_training=training) # Shape: [bs, h*w, n_token].
selected = tf.transpose(selected, [0, 2, 1]) # Shape: [bs, n_token, h*w].
selected = tf.nn.softmax(selected, axis=-1)
feat = tf.einsum("...si,...id->...sd", selected, inputs)
return feat # Shape: [bs, n_token, c]
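# Illustrative sketch (not part of the original file): reducing 81 spatial
# tokens to 8 learned tokens, mirroring the shapes in the unit test. The input
# is [batch, h*w, channels]; the output is [batch, num_tokens, channels].
def _example_token_learner():
  layer = TokenLearnerModule(num_tokens=8)
  tokens = tf.random.normal((2, 81, 512))
  return layer(tokens, training=False)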

View File

@ -0,0 +1,37 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for token_learner."""
from absl.testing import parameterized
from robotics_transformer.tokenizers import token_learner
import tensorflow as tf
class TokenLearnerTest(parameterized.TestCase):
@parameterized.named_parameters(('sample_input', 512, 8))
def testTokenLearner(self, embedding_dim, num_tokens):
batch = 1
seq = 2
token_learner_layer = token_learner.TokenLearnerModule(
num_tokens=num_tokens)
inputvec = tf.random.normal(shape=(batch * seq, 81, embedding_dim))
learnedtokens = token_learner_layer(inputvec)
self.assertEqual(learnedtokens.shape,
[batch * seq, num_tokens, embedding_dim])
if __name__ == '__main__':
tf.test.main()

169
transformer.py Normal file
View File

@ -0,0 +1,169 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RT1 decoder transformer.
Copied from:
https://www.tensorflow.org/text/tutorials/transformer#decoder
"""
from typing import Tuple, Union
import tensorflow as tf
class _TransformerLayer(tf.keras.layers.Layer):
"""A single transformer block."""
def __init__(self,
layer_size: int = 4096,
num_heads: int = 8,
feed_forward_size: int = 512,
dropout_rate: float = 0.1,
return_attention_scores: bool = False):
"""Creates a Transformer layer.
Args:
      layer_size: Size of the multi-head attention layer.
      num_heads: Number of heads for the multi-head attention layer.
feed_forward_size: Dimensionality of the feed_forward layer.
dropout_rate: Dropout rate.
return_attention_scores: Return attention scores.
"""
super(_TransformerLayer, self).__init__()
self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
self.mha1 = tf.keras.layers.MultiHeadAttention(
key_dim=layer_size, num_heads=num_heads, dropout=dropout_rate)
self.ff = tf.keras.layers.Dense(feed_forward_size)
self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
self.dropout_ff = tf.keras.layers.Dropout(dropout_rate)
self._return_attention_scores = return_attention_scores
def call(self, x: tf.Tensor, attention_mask: tf.Tensor,
training: bool) -> Tuple[tf.Tensor, Union[tf.Tensor, None]]:
"""Calls the layer.
Args:
x: Input Tensor of shape `(B, T, dim)`.
attention_mask: a boolean mask of shape `(B, T, T)`, that prevents
attention to certain positions. The boolean mask specifies which query
elements can attend to which key elements, 1 indicates attention and 0
indicates no attention. Broadcasting can happen for the missing batch
dimensions and the head dimension.
training: Python boolean indicating whether the layer should behave in
training mode (adding dropout) or in inference mode (no dropout).
Returns:
      y: Output Tensor of shape `(B, T, dim)`. Also returns the attention
        scores of shape `(B, num_heads, T, T)`, or None if
        `return_attention_scores` is False.
"""
x1 = self.layernorm1(x)
mha_results = self.mha1(
query=x1,
key=x1,
value=x1,
attention_mask=attention_mask,
return_attention_scores=self._return_attention_scores,
training=training)
if self._return_attention_scores:
x1, score = mha_results
else:
x1, score = mha_results, None
x = x + x1
y = self.layernorm2(x)
ff_y = self.ff(y)
ff_y = self.dropout_ff(ff_y, training=training)
x = x + ff_y
return x, score
class Transformer(tf.keras.layers.Layer):
"""A decoder only transformer."""
def __init__(self,
num_layers: int = 1,
layer_size: int = 4096,
num_heads: int = 8,
feed_forward_size: int = 512,
dropout_rate: float = 0.1,
vocab_size: int = 256,
return_attention_scores: bool = False):
"""Creates a transformer.
Args:
num_layers: Number of transformer layers.
      layer_size: Size of the multi-head attention layer.
      num_heads: Number of heads for the multi-head attention layer.
feed_forward_size: Dimensionality of the feed_forward layer.
dropout_rate: Dropout rate.
vocab_size: Dimensionality of tokens from the output layer.
return_attention_scores: Return attention scores.
"""
super(Transformer, self).__init__()
self._layers = [
_TransformerLayer( # pylint: disable=g-complex-comprehension
layer_size=layer_size,
num_heads=num_heads,
feed_forward_size=feed_forward_size,
dropout_rate=dropout_rate,
return_attention_scores=return_attention_scores)
for _ in range(num_layers)
]
self._token_emb = tf.keras.layers.Dense(feed_forward_size)
self._position_emb = tf.keras.layers.Dense(feed_forward_size)
self._output_tokens = tf.keras.layers.Dense(vocab_size)
def call(
self,
x: tf.Tensor,
training: bool,
attention_mask: tf.Tensor,
) -> Union[tf.Tensor, Tuple[tf.Tensor, list[tf.Tensor]]]:
"""Calls the layer.
Args:
x: Input Tensor of shape `(B, T, dim)`.
training: Python boolean indicating whether the layer should behave in
training mode (adding dropout) or in inference mode (no dropout).
attention_mask: a boolean mask of shape `(B, T, T)`, that prevents
attention to certain positions. The boolean mask specifies which query
elements can attend to which key elements, 1 indicates attention and 0
indicates no attention. Broadcasting can happen for the missing batch
dimensions and the head dimension.
Returns:
      x: Output Tensor of shape `(B, T, vocab_size)`. If
        `return_attention_scores`, also returns a list with one attention
        score tensor per layer, each of shape `(B, num_heads, T, T)`.
"""
seq_len = tf.shape(x)[1]
batch_size = tf.shape(x)[0]
positions = tf.one_hot(
tf.tile(tf.expand_dims(tf.range(0, seq_len, 1), 0), [batch_size, 1]),
seq_len)
x = self._token_emb(x)
x += self._position_emb(positions)
scores = []
for layer in self._layers:
x, score = layer(x, attention_mask=attention_mask, training=training)
if score is not None:
scores.append(score)
x = self._output_tokens(x)
return x, scores
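# A minimal usage sketch (added for illustration, not part of the committed
# file). It runs a decoder-only forward pass under an explicit causal mask;
# the hyperparameters mirror transformer_test.py further below.
import tensorflow as tf
from robotics_transformer import transformer

net = transformer.Transformer(
    num_layers=2, layer_size=512, num_heads=4, feed_forward_size=256,
    dropout_rate=0.1, vocab_size=10, return_attention_scores=True)
tokens = tf.random.uniform([8, 12, 10])  # (B, T, dim)
# Lower-triangular mask: 1 means "may attend"; broadcast over batch and heads.
causal_mask = tf.linalg.band_part(tf.ones((12, 12)), -1, 0)
logits, scores = net(tokens, attention_mask=causal_mask, training=False)
# logits: (8, 12, 10); scores: one attention score tensor per layer.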

689
transformer_network.py Normal file
View File

@ -0,0 +1,689 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tensorflow based methods for sequence agents."""
from typing import Optional, Tuple, Union, Any
from absl import logging
import numpy as np
from robotics_transformer import transformer
from robotics_transformer.film_efficientnet import preprocessors
from robotics_transformer.tokenizers import action_tokenizer
from robotics_transformer.tokenizers import image_tokenizer
from tensor2robot.utils import tensorspec_utils
import tensorflow as tf
from tf_agents.networks import network
from tf_agents.specs import tensor_spec
from tf_agents.utils import nest_utils
class TransformerNetwork(network.Network):
"""A transformer based actor network."""
def __init__(
self,
input_tensor_spec: tensorspec_utils.TensorSpecStruct,
output_tensor_spec: tensorspec_utils.TensorSpecStruct,
train_step_counter: int = 0,
vocab_size: int = 256,
token_embedding_size: int = 512,
num_layers: int = 1,
layer_size: int = 4096,
num_heads: int = 8,
feed_forward_size: int = 512,
dropout_rate: float = 0.1,
time_sequence_length: int = 1,
crop_size: int = 236,
policy_info_spec: Optional[dict[Any,
tensor_spec.BoundedTensorSpec]] = None,
action_order: Optional[list[str]] = None,
use_token_learner: Optional[bool] = True,
return_attention_scores: bool = False,
**kwargs):
"""Creates a transformer network.
Args:
input_tensor_spec: Nested list/tuple/dict of TensorSpecs, describing the
shape of input tensor.
output_tensor_spec: Nested list/tuple/dict of TensorSpecs, describing the
shape of output tensor.
train_step_counter: Counter for number of steps.
vocab_size: Dimensionality of tokens from the output layer.
token_embedding_size: Dimensionality of tokens from the embedding layer.
num_layers: Number of transformer layers.
      layer_size: Size of the multi-head attention layer.
      num_heads: Number of heads for the multi-head attention layer.
feed_forward_size: Dimensionality of the feed_forward layer.
dropout_rate: Dropout rate.
time_sequence_length: Length of the time sequence.
crop_size: Height and width of the square crop, where original image will
be padded to allow full field of view to be extracted.
policy_info_spec: Spec on return value given return type of the return
tokenizer.
action_order: Order of actions for the action tokenizer.
use_token_learner: Whether to use token learner. See
https://arxiv.org/abs/2106.11297
      return_attention_scores: Whether to return attention scores, e.g. for
        visualization in TensorBoard.
**kwargs: Keyword parameter arguments.
"""
self._input_tensor_spec = input_tensor_spec
self._output_tensor_spec = output_tensor_spec
self._train_step_counter = train_step_counter
self._actions = None
self._returns = None
self._vocab_size = vocab_size
self._token_embedding_size = token_embedding_size
self._time_sequence_length = time_sequence_length
self._crop_size = crop_size
self._transformer = transformer.Transformer(
num_layers=num_layers,
layer_size=layer_size,
num_heads=num_heads,
feed_forward_size=feed_forward_size,
dropout_rate=dropout_rate,
vocab_size=self._vocab_size,
return_attention_scores=return_attention_scores)
# create tokenizers
self._image_tokenizer = image_tokenizer.RT1ImageTokenizer(
embedding_output_dim=self._token_embedding_size,
use_token_learner=use_token_learner)
self._action_tokenizer = action_tokenizer.RT1ActionTokenizer(
output_tensor_spec,
vocab_size=self._vocab_size,
action_order=action_order)
self._tokens_per_action = self._action_tokenizer.tokens_per_action
self._tokens_per_context_image = self._image_tokenizer.tokens_per_context_image
# generate loss and attention masks
self._generate_masks()
# define mappings to token embedding size
self._action_token_emb = tf.keras.layers.Dense(self._token_embedding_size)
# define loss function
self._loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
self._attention_scores = []
self._use_token_learner = use_token_learner
super(TransformerNetwork, self).__init__(
input_tensor_spec=input_tensor_spec, **kwargs)
self._state_spec = {
        # Force this to be 4-dimensional due to b/254902773.
        # Otherwise it could be 3-dimensional.
'context_image_tokens':
tensor_spec.TensorSpec(
shape=(time_sequence_length, self._tokens_per_context_image, 1,
token_embedding_size),
dtype=tf.float32,
name='context_image_tokens'),
'action_tokens':
tensor_spec.TensorSpec(
shape=(time_sequence_length, self._tokens_per_action, 1, 1),
dtype=tf.int32,
name='action_tokens'),
# Stores where in the window we are.
# This value is within range [0, time_sequence_length + 1].
# When seq_idx == time_sequence_length, context_image_tokens and
# action_tokens need to be shifted to the left.
'seq_idx':
tensor_spec.TensorSpec(
shape=(1, 1, 1, 1), dtype=tf.int32, name='seq_idx')
}
@property
def attention_scores(self) -> list[tf.Tensor]:
"""Return attention score. This is for debugging/visualization purpose."""
return self._attention_scores
def _get_action_index_for_token(self, k):
"""Returns action associated with the token at given position `k`.
If k is not an action token then it returns -1.
If k is part of the first action in the sequence then returns 0 etc.
Args:
k: an int that represents the position in the sequence.
Returns:
      The index of the action that this position belongs to, or -1 if this
      position is part of an image token.
"""
if (k < 0 or k >= self._all_num_tokens):
return -1
n = k
if n % self._single_time_step_num_tokens < self._tokens_per_context_image:
return -1
return int(n / self._single_time_step_num_tokens)
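  # Illustrative note (added commentary, not in the original file): with, say,
  # tokens_per_context_image = 8 and tokens_per_action = 2, each time step
  # spans 10 positions. Positions 0-7 map to -1 (image tokens), positions 8-9
  # map to action 0, positions 10-17 map to -1, positions 18-19 map to
  # action 1, and so on.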
def _generate_masks(self):
"""Generate mask for action prediction loss and attention visualization."""
# each time step = [image, action]
self._single_time_step_num_tokens = (
self._tokens_per_action + self._tokens_per_context_image)
# full sequence = [prefix context + N x timestep + postfix context]
self._all_num_tokens = (
self._time_sequence_length * self._single_time_step_num_tokens)
    # create mask for action prediction loss
self._action_tokens_mask = []
for n in range(0, self._all_num_tokens, self._single_time_step_num_tokens):
for x in range(0, self._tokens_per_action, 1):
self._action_tokens_mask.append(x + n + self._tokens_per_context_image)
self._action_tokens_mask = tf.constant(
self._action_tokens_mask, dtype=tf.int32)
# The look ahead mask ensures causality.
self._default_attention_mask = tf.linalg.band_part(
tf.ones((self._all_num_tokens, self._all_num_tokens)), -1, 0)
action_mask = np.ndarray(
shape=(self._all_num_tokens, self._all_num_tokens), dtype=int)
for i in range(self._all_num_tokens):
for j in range(self._all_num_tokens):
action_i = self._get_action_index_for_token(i)
action_j = self._get_action_index_for_token(j)
mask = 0
if action_i != -1 and action_j != -1:
# Ignore actions of previous steps.
if action_j < action_i:
mask = 1
          # Since we do not autoregress over action dimensions, also ignore
          # action dimensions of the current step.
if (action_j == action_i and j <= i):
mask = 1
action_mask[i, j] = mask
self._default_attention_mask -= action_mask
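    # Illustrative note (added commentary, not in the original file): for a
    # toy configuration with 1 image token and 1 action token per step and
    # 2 time steps, the resulting mask (1 = may attend) works out to
    #   [[1, 0, 0, 0],   # image 0 attends to itself only
    #    [1, 0, 0, 0],   # action 0 attends to image 0 only
    #    [1, 1, 1, 0],   # image 1 attends to image 0, action 0, and itself
    #    [1, 0, 1, 0]]   # action 1 attends to images 0 and 1 only
    # i.e. causal attention with all action-to-action attention (including an
    # action token's attention to itself) removed.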
def _transformer_call(
self,
context_image_tokens: tf.Tensor,
action_tokens: tf.Tensor,
batch_size: int,
training: bool,
attention_mask: tf.Tensor,
) -> Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]]:
"""Calls the transformer.
Args:
      context_image_tokens: Tokenized context and image in Tensor of shape
        `(B, T, num_tokens, -1)`.
action_tokens: Discrete action token sequence of size [8, 256].
batch_size: Batch size as when reshaping all tokens.
training: Whether to run the transformer in training mode.
attention_mask: Optional bool tensor for masking transformer's attention.
Returns:
      Output tokens in Tensor of shape `(B, T, dim)`. Attention scores, if
      requested via `return_attention_scores`, are stored in
      `self._attention_scores` rather than returned.
"""
input_token_sequence = self._assemble_input_token_sequence(
context_image_tokens, action_tokens, batch_size)
# run transformer
output_tokens, self._attention_scores = self._transformer(
input_token_sequence, training, attention_mask)
return output_tokens
def _get_tokens_and_mask(self,
observations: dict[str, tf.Tensor],
network_state: dict[str, tf.Tensor],
training: bool = False):
# tokenize all inputs
context_image_tokens, network_state = self._tokenize_images(
observations, network_state, training)
action_tokens = self._tokenize_actions(observations, network_state)
# generate transformer attention mask
attention_mask = self._default_attention_mask
return (context_image_tokens, action_tokens, attention_mask)
def _transformer_call_and_slice(self,
*args,
slice_start: int = 0,
slice_length: int = 1,
**kwargs) -> Tuple[tf.Tensor, tf.Tensor]:
output_tokens = self._transformer_call(*args, **kwargs)
slice_end = slice_start + slice_length
token_logits = output_tokens[:, slice_start:slice_end, :]
token = tf.argmax(token_logits, axis=-1, output_type=tf.int32)
return token, token_logits
def call(self,
observations: dict[str, tf.Tensor],
network_state: dict[str, tf.Tensor],
training: bool = False):
"""Calls the transformer network.
Args:
observations: Observation data including image and natural language
embedding in dict of Tensors.
network_state: Network state data including time step, image, action
tokens, step number in dict of Tensors.
training: Whether to call transformer network in training mode.
Returns:
A tuple `(Detokenized output actions, network state)`.
"""
# used to determine training vs inference call
# outer_rank will be 2 -> [b, t] during training and
# outer_rank will be 1 -> [b] during inference
outer_rank = self._get_outer_rank(observations)
assert outer_rank in (1, 2)
b, t = self._get_batch_size_and_seq_len(network_state)
context_image_tokens, action_tokens, attention_mask = self._get_tokens_and_mask(
observations, network_state, training)
self._aux_info = {'action_labels': action_tokens}
if outer_rank == 1: # This is an inference call
# run transformer in loop to produce action tokens one-by-one
# TODO(b/231896343): Document/comment more on what the following mess is.
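      # In brief (added commentary, not in the original file): at inference
      # time the transformer is run once per action dimension. `seq_idx`
      # tracks the current slot in the time window. Because the transformer
      # is trained to predict the *next* token, the logits for action token k
      # of the current step sit one position to the left (hence
      # transformer_shift = -1). Each token is greedily decoded with argmax
      # and written back into `action_tokens` before the next pass.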
seq_idx = tf.reshape(network_state['seq_idx'], [1])[0]
action_t = tf.minimum(seq_idx, self._time_sequence_length - 1)
# Transformer shifts all to the left by one step by default (it's usually
# predicting the next token as default training task...).
transformer_shift = -1
# We only want to get the action predicted at time_step.
start_index = (
transformer_shift + self._tokens_per_context_image + action_t *
(self._single_time_step_num_tokens))
current_action_tokens = []
action_predictions_logits = []
for k in range(self._tokens_per_action):
action_index = start_index + k
token, token_logits = self._transformer_call_and_slice(
context_image_tokens,
action_tokens,
attention_mask=attention_mask,
batch_size=b,
training=training,
slice_start=action_index # slicing single action dimension
)
action_predictions_logits.append(token_logits)
current_action_tokens.append(token)
# action_tokens is [b, t * self._tokens_per_action]
action_tokens = tf.reshape(action_tokens, [b, -1])
action_start_index = (action_t * self._tokens_per_action) + k
action_tokens = tf.concat([
action_tokens[:, :action_start_index], token,
action_tokens[:, action_start_index + 1:]
],
axis=1)
# action_tokens is [b, t, self._tokens_per_action]
action_tokens = tf.reshape(action_tokens,
[b, t, self._tokens_per_action])
self._aux_info.update({
# action_predictions_logits is
# [b, self._tokens_per_action, self._vocab_size]
'action_predictions_logits': tf.concat(action_predictions_logits, 1)
})
# predicted_tokens_for_output is [b, self._tokens_per_action]
predicted_tokens_for_output = tf.concat(current_action_tokens, 1)
# state_action_tokens is [b, 1, self._tokens_per_action, 1, 1]
one_state_action_tokens = predicted_tokens_for_output[:, tf.newaxis, :,
tf.newaxis,
tf.newaxis]
state_action_tokens = network_state['action_tokens']
network_state['action_tokens'] = tf.concat([
state_action_tokens[:, :action_t, ...], one_state_action_tokens,
state_action_tokens[:, action_t + 1:, ...]
],
axis=1)
# Increment the time_step for the next inference call.
network_state['seq_idx'] = tf.reshape(
tf.minimum(seq_idx + 1, self._time_sequence_length), [-1, 1, 1, 1, 1])
self._loss = tf.constant(0.0)
else:
# training call --> simply run one transformer forward pass
output_tokens = self._transformer_call(
context_image_tokens,
action_tokens,
attention_mask=attention_mask,
batch_size=b,
training=training)
# Gather all predicted actions for the action loss.
action_logits = tf.gather(
output_tokens, self._action_tokens_mask - 1, axis=1)
action_logits_for_training = tf.reshape(
action_logits, [b, t, self._tokens_per_action, -1])
# Only take the last action as the action.
# action_logits_for_output is [b, self._tokens_per_action, emb]
action_logits_for_output = action_logits_for_training[:, -1]
# predicted_tokens_for_output is [b, self._tokens_per_action]
predicted_tokens_for_output = tf.argmax(
action_logits_for_output, axis=-1, output_type=tf.int32)
num_items = (
tf.cast(b * t, tf.float32) * self._single_time_step_num_tokens)
action_loss = tf.reduce_mean(
self._loss_object(action_tokens, action_logits_for_training) /
num_items,
axis=-1)
self._loss = action_loss
# store action labels and predictions for visualization
self._aux_info.update({
'action_predictions':
tf.argmax(
action_logits_for_training, axis=-1, output_type=tf.int32),
'action_loss':
action_loss,
'actor_loss_mask':
tf.ones([b], dtype=tf.float32)
})
output_actions = self._action_tokenizer.detokenize(
predicted_tokens_for_output)
return output_actions, network_state
def add_summaries(self, observations: dict[str, tf.Tensor],
logging_info: dict[str, tf.Tensor], debug_summaries: bool,
training: bool) -> None:
"""Adds summaries.
Args:
observations: Observation data including image and natural language
instruction in dict of Tensors.
logging_info: Dict with all data stored for logging during training pass.
debug_summaries: Whether to include debug summaries.
training: Whether this function is called during training or inference.
"""
num_params = 0
for weight in self.trainable_weights:
weight_params = 1
for dim in weight.shape:
weight_params *= dim
num_params += weight_params
tf.compat.v2.summary.scalar(name='num_params', data=num_params)
# debug_summaries are for the non-tpu worker, train_summary.
if debug_summaries:
image = observations['image'] # [b, t, h, w, c]
image_h = image.shape[2]
image_w = image.shape[3]
batch_size = image.shape[0]
num_ts = image.shape[1]
logging.info('image shape %s', image.shape)
# Concat images for different timesteps across width.
image = tf.concat(tf.unstack(image, axis=1), 2)
# Concat images for different batches (up to 8) across height.
image = tf.expand_dims(tf.concat(tf.unstack(image, axis=0)[0:8], 0), 0)
tf.summary.image(
'observations/image',
image,
step=self._train_step_counter,
# Single output since we have concatenated images along batch.
max_outputs=1)
# [b, t], strings
if 'natural_language_instruction' in observations:
task = observations['natural_language_instruction'][:, 0]
tf.summary.text(
'natural_language_instruction', task, step=self._train_step_counter)
if self.attention_scores and not self._use_token_learner:
for l_idx, layer_attention_score in enumerate(self.attention_scores):
logging.info('Attention score shape: %s, %s', l_idx,
layer_attention_score.shape)
for head_idx in range(layer_attention_score.shape[1]):
pairwise_attention = tf.expand_dims(
layer_attention_score[:, head_idx], -1)
# pairwise attention shape (16, 552, 552, 1)
# make attention from different time steps comparable
pairwise_attention = pairwise_attention * np.arange(
1, pairwise_attention.shape[1] + 1)[None, :, None, None]
# visualize spatial attention, note this only supports
# mk1_500tasks_transformer pipeline with no token learner
img_tf_ts = tf.reshape(
tf.transpose(
tf.reshape(
tf.reduce_sum(pairwise_attention, axis=1) / np.arange(
pairwise_attention.shape[1], 0, -1)[None, :, None],
[batch_size, num_ts, -1]),
[0, 2, 1])[:, :-self._tokens_per_action, :],
[-1, 9, 9, num_ts])
img_tf_ts = tf.image.resize(
img_tf_ts, [image_h, image_w],
method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
img_tf_ts_concat = tf.concat(tf.unstack(img_tf_ts, axis=3), 2)
img_tf_ts_concat_min = tf.reduce_min(
img_tf_ts_concat, axis=[1, 2], keepdims=True)
img_tf_ts_concat = (img_tf_ts_concat - img_tf_ts_concat_min) / (
tf.reduce_max(img_tf_ts_concat, axis=[1, 2], keepdims=True) -
img_tf_ts_concat_min)
img_tf_ts_concat = tf.concat(
tf.unstack(img_tf_ts_concat, axis=0)[:8], 0)
img_tf_ts_concat = tf.expand_dims(
tf.expand_dims(img_tf_ts_concat, 0), -1)
tf.summary.image(
'attention/layer_{}/head_{}'.format(l_idx, head_idx),
img_tf_ts_concat,
step=self._train_step_counter,
# Single output since we have concatenated images along batch.
max_outputs=1)
if img_tf_ts_concat.shape[1] == image.shape[
1] and img_tf_ts_concat.shape[2] == image.shape[2]:
# can overlay
overlay_viz = tf.cast(
(tf.cast(image, tf.float32) * (0.2 + img_tf_ts_concat) / 1.2),
tf.uint8)
tf.summary.image(
'overlay_attention/layer_{}/head_{}'.format(l_idx, head_idx),
overlay_viz,
step=self._train_step_counter,
# Single output since we have concatenated images along batch.
max_outputs=1)
# log action info
action_labels = tf.boolean_mask(logging_info['action_labels'],
logging_info['actor_loss_mask'])
action_predictions = tf.boolean_mask(logging_info['action_predictions'],
logging_info['actor_loss_mask'])
with tf.name_scope('ActionTokens'):
token_accuracy = (
tf.cast(tf.equal(action_labels, action_predictions), tf.float32))
accuracy = tf.reduce_mean(token_accuracy)
tf.compat.v2.summary.scalar(
name='accuracy', data=accuracy, step=self._train_step_counter)
# Accuracy across timesteps
for t in range(self._time_sequence_length):
tf.compat.v2.summary.scalar(
name='accuracy/time_step/{}'.format(t),
data=tf.reduce_mean(token_accuracy[:, t, :]),
step=self._train_step_counter)
token_index = 0
for k in self._action_tokenizer.action_order:
spec = self._action_tokenizer.action_spec[k]
if spec.dtype == tf.int32:
n_tokens = 1
else:
n_tokens = spec.shape[0]
action_token_accuracy = tf.reduce_mean(
token_accuracy[:, :, token_index:token_index + n_tokens])
tf.compat.v2.summary.scalar(
name='accuracy/action_type/{}'.format(k),
data=action_token_accuracy,
step=self._train_step_counter)
for n in range(n_tokens):
tf.summary.histogram(
'tokens/{}_{}/labels'.format(k, n + 1),
action_labels[:, :, token_index],
step=self._train_step_counter)
tf.summary.histogram(
'tokens/{}_{}/predictions'.format(k, n + 1),
action_predictions[:, :, token_index],
step=self._train_step_counter)
token_index += 1
# log loss components
with tf.name_scope('TokenLosses'):
tf.compat.v2.summary.scalar(
name='action_loss',
data=tf.reduce_mean(logging_info['action_loss']),
step=self._train_step_counter)
def _tokenize_images(self, observations, network_state, training):
image = observations['image'] # [b, t, h, w, c]
outer_rank = self._get_outer_rank(observations)
if outer_rank == 1: # This is an inference call
seq_idx = tf.reshape(network_state['seq_idx'], [1])[0]
time_step = tf.minimum(seq_idx, self._time_sequence_length - 1)
image = tf.expand_dims(image, 1)
# TODO(b/255731285)
image_shape = tf.shape(image)
b = image_shape[0]
input_t = image_shape[1]
h = image_shape[2]
w = image_shape[3]
c = image_shape[4]
context = self._extract_context_from_observation(observations, input_t)
image = tf.reshape(image, [b * input_t, h, w, c])
seed = tf.random.uniform(shape=(2,), maxval=2**30, dtype=tf.int32)
image = preprocessors.convert_dtype_and_crop_images(
image,
crop_size=self._crop_size,
training=training,
pad_then_crop=True,
convert_dtype=True,
seed=seed)
image = tf.reshape(image, [b, input_t, h, w, c])
context_image_tokens = self._image_tokenizer(
image, context=context, training=training)
num_tokens = tf.shape(context_image_tokens)[2]
context_image_tokens = tf.reshape(context_image_tokens,
[b, input_t, num_tokens, 1, -1])
if outer_rank == 1: # This is an inference call
network_state['context_image_tokens'] = tf.reshape(
network_state['context_image_tokens'], [
b, self._time_sequence_length, self._tokens_per_context_image, 1,
-1
])
state_image_tokens = network_state['context_image_tokens']
# network_state as input for this call is the output from the last call.
# Therefore, we need to shift all images to the left by 1 in the time axis
# to align w/ the time dim in this call.
state_image_tokens = tf.cond(
seq_idx == self._time_sequence_length,
lambda: tf.roll(state_image_tokens, -1, axis=1),
lambda: state_image_tokens)
context_image_tokens = tf.concat([
state_image_tokens[:, :time_step, ...], context_image_tokens,
state_image_tokens[:, time_step + 1:, ...]
],
axis=1)
network_state['context_image_tokens'] = context_image_tokens
return context_image_tokens, network_state
def _tokenize_actions(self, observations, network_state):
outer_rank = self._get_outer_rank(observations)
if outer_rank == 1: # This is an inference call
# TODO(b/231896343): Clarify what is going on with the network state
# tensors, currently they all have to be the same n_dims so we have to
# add/remove dummy dims.
action_tokens = tf.squeeze(network_state['action_tokens'], [3, 4])
seq_idx = tf.reshape(network_state['seq_idx'], [1])[0]
# network_state as input for this call is the output from the last call.
# Therefore, we need to shift all actions by 1 to the left.
action_tokens = tf.cond(seq_idx == self._time_sequence_length,
lambda: tf.roll(action_tokens, -1, axis=1),
lambda: action_tokens)
else:
assert outer_rank == 2
if self._actions is None:
b, t = self._get_batch_size_and_seq_len(network_state)
action_tokens = tf.zeros(
shape=[b, t, self._tokens_per_action], dtype=tf.int32)
else:
action_tokens = self._action_tokenizer.tokenize(self._actions)
return action_tokens
def _assemble_input_token_sequence(self, context_image_tokens, action_tokens,
batch_size):
# embed action tokens
action_tokens = tf.one_hot(action_tokens, self._vocab_size)
action_tokens = self._action_token_emb(action_tokens)
action_tokens = tf.zeros_like(action_tokens) # b/260260205
# Because of b/254902773, we need to add 1 extra dimension.
action_tokens = tf.expand_dims(action_tokens, axis=-2)
# assemble token sequence
input_token_sequence = tf.concat([context_image_tokens, action_tokens],
axis=2)
input_token_sequence = tf.reshape(
input_token_sequence, [batch_size, -1, self._token_embedding_size])
return input_token_sequence
def _extract_context_from_observation(self, observations, seq_len):
"""Extract context from observation."""
context = None
if 'natural_language_embedding' in observations:
outer_rank = self._get_outer_rank(observations)
context = observations['natural_language_embedding'] # [b, t, emb-size]
if outer_rank == 1:
context = tf.tile(context[:, None], [1, seq_len, 1])
return context
def set_actions(self, actions: tensorspec_utils.TensorSpecStruct):
"""Sets actions that will be tokenized and used in transformer network.
Args:
      actions: Actions to be tokenized and used in the transformer network.
        Example actions: terminate = [0, 1], world_vector = [0.9, 0.8, -0.3],
        rotation_delta = [-0.1, 0.2, 0.6], gripper_closedness = 0.9.
"""
self._actions = actions
def _get_outer_rank(self, observations):
# used to determine training vs inference call
# outer_rank will be 2 -> [b, t] during training and
# outer_rank will be 1 -> [b] during inference
return nest_utils.get_outer_rank(observations, self._input_tensor_spec)
def _get_batch_size_and_seq_len(self, network_state):
image_shape = tf.shape(network_state['context_image_tokens'])
b = image_shape[0]
t = image_shape[1]
return b, t
def get_actor_loss(self) -> tf.Tensor:
return self._loss
def get_aux_info(self) -> dict[str, Any]:
return self._aux_info
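# A minimal sketch (added for illustration, not part of the committed file) of
# the per-episode state dictionary described by `_state_spec` above. In
# practice a tf_agents policy would typically build this from the network's
# state_spec (e.g. via get_initial_state); the token counts below are
# placeholders, the real values come from the image and action tokenizers.
import tensorflow as tf

batch_size = 1
time_sequence_length = 6          # placeholder
tokens_per_context_image = 8      # placeholder
tokens_per_action = 11            # placeholder
token_embedding_size = 512

network_state = {
    'context_image_tokens': tf.zeros(
        [batch_size, time_sequence_length, tokens_per_context_image, 1,
         token_embedding_size], dtype=tf.float32),
    'action_tokens': tf.zeros(
        [batch_size, time_sequence_length, tokens_per_action, 1, 1],
        dtype=tf.int32),
    # Where in the time window we are; starts at 0 for a new episode.
    'seq_idx': tf.zeros([batch_size, 1, 1, 1, 1], dtype=tf.int32),
}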

229
transformer_network_test.py Normal file
View File

@ -0,0 +1,229 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for networks."""
from absl.testing import parameterized
from robotics_transformer import transformer_network
from robotics_transformer.transformer_network_test_set_up import BATCH_SIZE
from robotics_transformer.transformer_network_test_set_up import NAME_TO_INF_OBSERVATIONS
from robotics_transformer.transformer_network_test_set_up import NAME_TO_STATE_SPECS
from robotics_transformer.transformer_network_test_set_up import observations_list
from robotics_transformer.transformer_network_test_set_up import spec_names_list
from robotics_transformer.transformer_network_test_set_up import state_spec_list
from robotics_transformer.transformer_network_test_set_up import TIME_SEQUENCE_LENGTH
from robotics_transformer.transformer_network_test_set_up import TransformerNetworkTestUtils
import tensorflow as tf
from tf_agents.specs import tensor_spec
class TransformerNetworkTest(TransformerNetworkTestUtils):
# pylint:disable=g-complex-comprehension
@parameterized.named_parameters([{
'testcase_name': '_' + name,
'state_spec': spec,
'train_observation': obs,
} for (name, spec,
obs) in zip(spec_names_list(), state_spec_list(), observations_list())]
)
# pylint:enable=g-complex-comprehension
def testTransformerTrainLossCall(self, state_spec, train_observation):
network = transformer_network.TransformerNetwork(
input_tensor_spec=state_spec,
output_tensor_spec=self._action_spec,
time_sequence_length=TIME_SEQUENCE_LENGTH)
network.create_variables()
self.assertNotEmpty(network.variables)
network.set_actions(self._train_action)
network_state = tensor_spec.sample_spec_nest(
network.state_spec, outer_dims=[BATCH_SIZE])
output_actions, network_state = network(
train_observation, step_type=None, network_state=network_state)
expected_shape = [2, 3]
self.assertEqual(network.get_actor_loss().shape,
tf.TensorShape(expected_shape))
self.assertCountEqual(self._train_action.keys(), output_actions.keys())
# pylint:disable=g-complex-comprehension
@parameterized.named_parameters([{
'testcase_name': '_' + name,
'spec_name': name,
} for name in spec_names_list()])
# pylint:enable=g-complex-comprehension
def testTransformerInferenceLossCall(self, spec_name):
state_spec = NAME_TO_STATE_SPECS[spec_name]
observation = NAME_TO_INF_OBSERVATIONS[spec_name]
network = transformer_network.TransformerNetwork(
input_tensor_spec=state_spec,
output_tensor_spec=self._action_spec,
time_sequence_length=TIME_SEQUENCE_LENGTH,
action_order=[
'terminate_episode', 'world_vector', 'rotation_delta',
'gripper_closedness_action'
])
network.create_variables()
self.assertNotEmpty(network.variables)
network.set_actions(self._inference_action)
    # Inference currently only supports a batch size of 1.
network_state = tensor_spec.sample_spec_nest(
network.state_spec, outer_dims=[1])
output_actions, network_state = network(
observation, step_type=None, network_state=network_state)
tf.debugging.assert_equal(network.get_actor_loss(), 0.0)
self.assertCountEqual(self._inference_action.keys(), output_actions.keys())
# pylint:disable=g-complex-comprehension
@parameterized.named_parameters([{
'testcase_name': '_' + name,
'state_spec': spec,
'train_observation': obs,
} for name, spec, obs in zip(spec_names_list(), state_spec_list(),
observations_list())])
# pylint:enable=g-complex-comprehension
def testTransformerLogging(self, state_spec, train_observation):
network = transformer_network.TransformerNetwork(
input_tensor_spec=state_spec,
output_tensor_spec=self._action_spec,
time_sequence_length=TIME_SEQUENCE_LENGTH,
action_order=[
'terminate_episode', 'world_vector', 'rotation_delta',
'gripper_closedness_action'
])
network.create_variables()
self.assertNotEmpty(network.variables)
network.set_actions(self._train_action)
network_state = tensor_spec.sample_spec_nest(
network.state_spec, outer_dims=[BATCH_SIZE])
_ = network(train_observation, step_type=None, network_state=network_state)
network.add_summaries(
train_observation,
network.get_aux_info(),
debug_summaries=True,
training=True)
# pylint:disable=g-complex-comprehension
@parameterized.named_parameters([{
'testcase_name': '_' + name,
'state_spec': spec,
} for name, spec in zip(spec_names_list(), state_spec_list())])
# pylint:enable=g-complex-comprehension
def testTransformerCausality(self, state_spec):
"""Tests the causality for the transformer.
Args:
state_spec: Which state spec to test the transformer with
"""
network = transformer_network.TransformerNetwork(
input_tensor_spec=state_spec,
output_tensor_spec=self._action_spec,
time_sequence_length=TIME_SEQUENCE_LENGTH)
network.create_variables()
self.assertNotEmpty(network.variables)
time_sequence_length = network._time_sequence_length
tokens_per_image = network._tokens_per_context_image
tokens_per_action = network._tokens_per_action
def _split_image_and_action_tokens(all_tokens):
image_start_indices = [(tokens_per_image + tokens_per_action) * k
for k in range(time_sequence_length)]
image_tokens = tf.stack(
[all_tokens[i:i + tokens_per_image] for i in image_start_indices],
axis=0)
action_start_indices = [i + tokens_per_image for i in image_start_indices]
action_tokens = [
tf.stack([
all_tokens[i:i + tokens_per_action] for i in action_start_indices
], 0)
]
image_tokens = tf.one_hot(image_tokens, network._token_embedding_size)
# Remove extra dimension before the end once b/254902773 is fixed.
shape = image_tokens.shape
# Add batch dimension.
image_tokens = tf.reshape(image_tokens,
[1] + shape[:-1] + [1] + shape[-1:])
return image_tokens, action_tokens
# Generate some random tokens for image and actions.
all_tokens = tf.random.uniform(
shape=[time_sequence_length * (tokens_per_image + tokens_per_action)],
dtype=tf.int32,
maxval=10,
minval=0)
context_image_tokens, action_tokens = _split_image_and_action_tokens(
all_tokens)
# Get the output tokens without any zeroed out input tokens.
output_tokens = network._transformer_call(
context_image_tokens=context_image_tokens,
action_tokens=action_tokens,
attention_mask=network._default_attention_mask,
batch_size=1,
training=False)[0]
for t in range(time_sequence_length *
(tokens_per_image + tokens_per_action)):
# Zero out future input tokens.
all_tokens_at_t = tf.concat(
[all_tokens[:t + 1],
tf.zeros_like(all_tokens[t + 1:])], 0)
context_image_tokens, action_tokens = _split_image_and_action_tokens(
all_tokens_at_t)
# Get the output tokens with zeroed out input tokens after t.
output_tokens_at_t = network._transformer_call(
context_image_tokens=context_image_tokens,
action_tokens=action_tokens,
attention_mask=network._default_attention_mask,
batch_size=1,
training=False)[0]
# The output token is unchanged if future input tokens are zeroed out.
self.assertAllEqual(output_tokens[:t + 1], output_tokens_at_t[:t + 1])
def testLossMasks(self):
self._define_specs()
self._create_agent()
image_tokens = 3
action_tokens = 2
self._agent._actor_network._time_sequence_length = 2
self._agent._actor_network._tokens_per_context_image = image_tokens
self._agent._actor_network._tokens_per_action = action_tokens
self._agent._actor_network._generate_masks()
self.assertAllEqual(
self._agent._actor_network._action_tokens_mask,
tf.constant([
image_tokens, image_tokens + 1, 2 * image_tokens + action_tokens,
2 * image_tokens + action_tokens + 1
], tf.int32))
self._agent._actor_network._generate_masks()
self.assertAllEqual(
self._agent._actor_network._action_tokens_mask,
tf.constant([
image_tokens, image_tokens + 1, 2 * (image_tokens) + action_tokens,
2 * (image_tokens) + action_tokens + 1
], tf.int32))
if __name__ == '__main__':
# Useful to enable if running with ipdb.
tf.config.run_functions_eagerly(True)
tf.test.main()

391
transformer_network_test_set_up.py Normal file
View File

@ -0,0 +1,391 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for networks."""
import copy
from typing import Optional, Tuple, Union
from absl.testing import parameterized
import numpy as np
from robotics_transformer import sequence_agent
from robotics_transformer import transformer_network
from tensor2robot.utils import tensorspec_utils
import tensorflow as tf
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts
BATCH_SIZE = 2
TIME_SEQUENCE_LENGTH = 3
HEIGHT = 256
WIDTH = 320
NUM_IMAGE_TOKENS = 2
def spec_names_list() -> list[str]:
"""Lists the different types of specs accepted by the transformer."""
return ['default']
def state_spec_list() -> list[tensorspec_utils.TensorSpecStruct]:
"""Lists the different types of state spec accepted by the transformer."""
state_spec = tensorspec_utils.TensorSpecStruct()
state_spec.image = tensor_spec.BoundedTensorSpec([HEIGHT, WIDTH, 3],
dtype=tf.float32,
name='image',
minimum=0.,
maximum=1.)
state_spec.natural_language_embedding = tensor_spec.TensorSpec(
shape=[512], dtype=tf.float32, name='natural_language_embedding')
state_spec_mask = copy.deepcopy(state_spec)
state_spec_mask.initial_binary_mask = tensor_spec.BoundedTensorSpec(
[HEIGHT, WIDTH, 1],
dtype=tf.int32,
name='initial_binary_mask',
minimum=0,
maximum=255)
state_spec_tcl = copy.deepcopy(state_spec)
state_spec_tcl.original_image = tensor_spec.BoundedTensorSpec(
[HEIGHT, WIDTH, 3],
dtype=tf.float32,
name='original_image',
minimum=0.,
maximum=1.)
return [
state_spec,
state_spec_mask,
state_spec_tcl,
]
def observations_list(training: bool = True) -> list[dict[str, tf.Tensor]]:
"""Lists the different types of observations accepted by the transformer."""
if training:
image_shape = [BATCH_SIZE, TIME_SEQUENCE_LENGTH, HEIGHT, WIDTH, 3]
emb_shape = [BATCH_SIZE, TIME_SEQUENCE_LENGTH, 512]
mask_shape = [BATCH_SIZE, TIME_SEQUENCE_LENGTH, HEIGHT, WIDTH, 1]
else:
    # Inference currently only supports a batch size of 1.
image_shape = [1, HEIGHT, WIDTH, 3]
emb_shape = [1, 512]
mask_shape = [1, HEIGHT, WIDTH, 1]
return [
{
'image': tf.constant(0.5, shape=image_shape),
'natural_language_embedding': tf.constant(1., shape=emb_shape),
},
{
'image': tf.constant(0.5, shape=image_shape),
'natural_language_embedding': tf.constant(1., shape=emb_shape),
'initial_binary_mask': tf.constant(192, shape=mask_shape),
},
{ # This is used for TCL.
'image': tf.constant(0.5, shape=image_shape),
'original_image': tf.constant(0.4, shape=image_shape),
'natural_language_embedding': tf.constant(1., shape=emb_shape),
},
]
NAME_TO_STATE_SPECS = dict(zip(spec_names_list(), state_spec_list()))
NAME_TO_OBSERVATIONS = dict(zip(spec_names_list(), observations_list()))
NAME_TO_INF_OBSERVATIONS = dict(
zip(spec_names_list(), observations_list(False)))
class FakeImageTokenizer(tf.keras.layers.Layer):
"""Fake Image Tokenizer for testing Transformer."""
def __init__(self,
encoder: ...,
position_embedding: ...,
embedding_output_dim: int,
patch_size: int,
use_token_learner: bool = False,
num_tokens: int = NUM_IMAGE_TOKENS,
use_initial_binary_mask: bool = False,
**kwargs):
del encoder, position_embedding, patch_size, use_token_learner
super().__init__(**kwargs)
self.tokens_per_context_image = num_tokens
if use_initial_binary_mask:
self.tokens_per_context_image += 1
self.embedding_output_dim = embedding_output_dim
self.use_initial_binary_mask = use_initial_binary_mask
def __call__(self,
image: tf.Tensor,
context: Optional[tf.Tensor] = None,
initial_binary_mask: Optional[tf.Tensor] = None,
training: bool = False) -> tf.Tensor:
if self.use_initial_binary_mask:
assert initial_binary_mask is not None
image_shape = tf.shape(image)
seq_size = image_shape[1]
batch_size = image_shape[0]
all_tokens = []
num_tokens = self.tokens_per_context_image
for t in range(seq_size):
tokens = tf.ones([batch_size, 1, num_tokens, self.embedding_output_dim
]) * image[0][t][0][0]
all_tokens.append(tokens)
return tf.concat(all_tokens, axis=1)
class TransformerNetworkTestUtils(tf.test.TestCase, parameterized.TestCase):
"""Defines specs, SequenceAgent, and various other testing utilities."""
def _define_specs(self,
train_batch_size=BATCH_SIZE,
inference_batch_size=1,
time_sequence_length=TIME_SEQUENCE_LENGTH,
inference_sequence_length=TIME_SEQUENCE_LENGTH,
token_embedding_size=512,
image_width=WIDTH,
image_height=HEIGHT):
"""Defines specs and observations (both training and inference)."""
self.train_batch_size = train_batch_size
self.inference_batch_size = inference_batch_size
self.time_sequence_length = time_sequence_length
self.inference_sequence_length = inference_sequence_length
self.token_embedding_size = token_embedding_size
action_spec = tensorspec_utils.TensorSpecStruct()
action_spec.world_vector = tensor_spec.BoundedTensorSpec(
(3,), dtype=tf.float32, minimum=-1., maximum=1., name='world_vector')
action_spec.rotation_delta = tensor_spec.BoundedTensorSpec(
(3,),
dtype=tf.float32,
minimum=-np.pi / 2,
maximum=np.pi / 2,
name='rotation_delta')
action_spec.gripper_closedness_action = tensor_spec.BoundedTensorSpec(
(1,),
dtype=tf.float32,
minimum=-1.,
maximum=1.,
name='gripper_closedness_action')
action_spec.terminate_episode = tensor_spec.BoundedTensorSpec(
(2,), dtype=tf.int32, minimum=0, maximum=1, name='terminate_episode')
state_spec = tensorspec_utils.TensorSpecStruct()
state_spec.image = tensor_spec.BoundedTensorSpec(
[image_height, image_width, 3],
dtype=tf.float32,
name='image',
minimum=0.,
maximum=1.)
state_spec.natural_language_embedding = tensor_spec.TensorSpec(
shape=[self.token_embedding_size],
dtype=tf.float32,
name='natural_language_embedding')
self._policy_info_spec = {
'return':
tensor_spec.BoundedTensorSpec((),
dtype=tf.float32,
minimum=0.0,
maximum=1.0,
name='return'),
'discounted_return':
tensor_spec.BoundedTensorSpec((),
dtype=tf.float32,
minimum=0.0,
maximum=1.0,
name='discounted_return'),
}
self._state_spec = state_spec
self._action_spec = action_spec
self._inference_observation = {
'image':
tf.constant(
1,
shape=[self.inference_batch_size, image_height, image_width, 3],
dtype=tf.dtypes.float32),
'natural_language_embedding':
tf.constant(
1.,
shape=[self.inference_batch_size, self.token_embedding_size],
dtype=tf.dtypes.float32),
}
self._train_observation = {
'image':
tf.constant(
0.5,
shape=[
self.train_batch_size, self.time_sequence_length,
image_height, image_width, 3
]),
'natural_language_embedding':
tf.constant(
1.,
shape=[
self.train_batch_size, self.time_sequence_length,
self.token_embedding_size
]),
}
self._inference_action = {
'world_vector':
tf.constant(0.5, shape=[self.inference_batch_size, 3]),
'rotation_delta':
tf.constant(0.5, shape=[self.inference_batch_size, 3]),
'terminate_episode':
tf.constant(
[0, 1] * self.inference_batch_size,
shape=[self.inference_batch_size, 2]),
'gripper_closedness_action':
tf.constant(0.5, shape=[self.inference_batch_size, 1]),
}
self._train_action = {
'world_vector':
tf.constant(
0.5,
shape=[self.train_batch_size, self.time_sequence_length, 3]),
'rotation_delta':
tf.constant(
0.5,
shape=[self.train_batch_size, self.time_sequence_length, 3]),
'terminate_episode':
tf.constant(
[0, 1] * self.train_batch_size * self.time_sequence_length,
shape=[self.train_batch_size, self.time_sequence_length, 2]),
'gripper_closedness_action':
tf.constant(
0.5,
shape=[self.train_batch_size, self.time_sequence_length, 1]),
}
def _create_agent(self, actor_network=None):
"""Creates SequenceAgent using custom actor_network."""
time_step_spec = ts.time_step_spec(observation_spec=self._state_spec)
if actor_network is None:
actor_network = transformer_network.TransformerNetwork
self._agent = sequence_agent.SequenceAgent(
time_step_spec=time_step_spec,
action_spec=self._action_spec,
actor_network=actor_network,
actor_optimizer=tf.keras.optimizers.Adam(),
train_step_counter=tf.compat.v1.train.get_or_create_global_step(),
time_sequence_length=TIME_SEQUENCE_LENGTH)
self._num_action_tokens = (
# pylint:disable=protected-access
self._agent._actor_network._action_tokenizer._tokens_per_action)
# pylint:enable=protected-access
def setUp(self):
self._define_specs()
super().setUp()
def get_image_value(self, step_idx: int) -> float:
return float(step_idx) / self.time_sequence_length
def get_action_logits(self, batch_size: int, value: int,
vocab_size: int) -> tf.Tensor:
return tf.broadcast_to(
tf.one_hot(value % vocab_size, vocab_size)[tf.newaxis, tf.newaxis, :],
[batch_size, 1, vocab_size])
def create_obs(self, value) -> dict[str, tf.Tensor]:
observations = {}
observations['image'] = value * self._inference_observation['image']
observations[
'natural_language_embedding'] = value * self._inference_observation[
'natural_language_embedding']
return observations
def fake_action_token_emb(self, action_tokens) -> tf.Tensor:
"""Just pad with zeros."""
shape = action_tokens.shape
assert self.vocab_size > self.token_embedding_size
assert len(shape) == 4
return action_tokens[:, :, :, :self.token_embedding_size]
def fake_transformer(
self, all_tokens, training,
attention_mask) -> Union[tf.Tensor, Tuple[tf.Tensor, list[tf.Tensor]]]:
"""Fakes the call to TransformerNetwork._transformer."""
del training
del attention_mask
# We expect ST00 ST01 A00 A01...
# Where:
# * ST01 is token 1 of state 0.
# * A01 is token 1 of action 0.
shape = all_tokens.shape.as_list()
batch_size = shape[0]
self.assertEqual(batch_size, 1)
emb_size = self.token_embedding_size
# transform to [batch_size, num_tokens, token_size]
all_tokens = tf.reshape(all_tokens, [batch_size, -1, emb_size])
# Pads tokens to be of vocab_size.
self.assertGreater(self.vocab_size, self.token_embedding_size)
all_shape = all_tokens.shape
self.assertLen(all_shape.as_list(), 3)
output_tokens = tf.concat([
all_tokens,
tf.zeros([
all_shape[0], all_shape[1],
self.vocab_size - self.token_embedding_size
])
],
axis=-1)
num_tokens_per_step = NUM_IMAGE_TOKENS + self._num_action_tokens
# Check state/action alignment.
window_range = min(self._step_idx + 1, self.time_sequence_length)
for j in range(window_range):
      # The step index that is stored at window position j = 0.
first_step_idx = max(0, self._step_idx + 1 - self.time_sequence_length)
image_idx = j * num_tokens_per_step
action_start_index = image_idx + NUM_IMAGE_TOKENS
for t in range(NUM_IMAGE_TOKENS):
self.assertAllEqual(
self.get_image_value(first_step_idx + j) *
tf.ones_like(all_tokens[0][image_idx][:self.token_embedding_size]),
all_tokens[0][image_idx + t][:self.token_embedding_size])
      # If j is not the current step in the window, all action dimensions
      # from previous steps have already been inferred and thus can be checked.
action_dims_range = self.action_inf_idx if j == window_range - 1 else self._num_action_tokens
for t in range(action_dims_range):
token_idx = action_start_index + t
action_value = (first_step_idx + j) * self._num_action_tokens + t
self.assertAllEqual(
self.get_action_logits(
batch_size=batch_size,
value=action_value,
vocab_size=self.vocab_size)[0][0][:self.token_embedding_size],
all_tokens[0][token_idx][:self.token_embedding_size])
# Output the right action dimension value.
image_token_index = (
min(self._step_idx, self.time_sequence_length - 1) *
num_tokens_per_step)
transformer_shift = -1
action_index = (
image_token_index + NUM_IMAGE_TOKENS + self.action_inf_idx +
transformer_shift)
action_value = self._step_idx * self._num_action_tokens + self.action_inf_idx
action_logits = self.get_action_logits(
batch_size=batch_size, value=action_value, vocab_size=self.vocab_size)
output_tokens = tf.concat([
output_tokens[:, :action_index, :], action_logits[:, :, :],
output_tokens[:, action_index + 1:, :]
],
axis=1)
self.action_inf_idx = (self.action_inf_idx + 1) % self._num_action_tokens
attention_scores = []
return output_tokens, attention_scores

55
transformer_test.py Normal file
View File

@ -0,0 +1,55 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for transformer."""
from absl.testing import parameterized
from robotics_transformer import transformer
import tensorflow as tf
class TransformerTest(parameterized.TestCase):
def setUp(self):
self._vocab_size = 10
batch_size = 8
sequence_len = 12
self._tokens = tf.random.uniform(
[batch_size, sequence_len, self._vocab_size],
minval=0,
maxval=1,
dtype=tf.dtypes.float32,
)
super(TransformerTest, self).setUp()
@parameterized.parameters(True, False)
def test_transformer_forwardpass(self, return_attention_scores):
network = transformer.Transformer(
num_layers=2,
layer_size=512,
num_heads=4,
feed_forward_size=256,
dropout_rate=0.1,
vocab_size=self._vocab_size,
return_attention_scores=return_attention_scores)
output_tokens, attention_scores = network(self._tokens, attention_mask=None)
self.assertSequenceEqual(self._tokens.shape.as_list(),
output_tokens.shape.as_list())
if return_attention_scores:
self.assertNotEmpty(attention_scores)
else:
self.assertEmpty(attention_scores)
if __name__ == '__main__':
tf.test.main()