# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A simple action tokenizer used with Robotics Transformer 1.

As an example, if an action is:

terminate = [0, 1]
world_vector = [0.9, 0.8, -0.3]
rotation_delta = [-0.1, 0.2, 0.6]
gripper_closedness = 0.9

then we build a sequence of 8 tokens: one per float dimension, plus a single
token for the discrete terminate action. Int32 action dimensions are assumed
to be already discrete and tokenized; float dimensions are bucketed according
to the spec's min and max, with 'vocab_size' buckets per dimension.

Currently, this tokenizer assumes a single action spec, and it is highly
recommended to specify the 'action_order', e.g.
[terminate, world_vector, rotation_delta, gripper_closedness]. Since that
ordering information is lost after tokenization, recording it explicitly is
useful for debugging. Actions may also be subselected for prediction, since
not all actions in the spec need to appear in the action_order.
"""

from typing import Optional

from tensor2robot.utils import tensorspec_utils
import tensorflow as tf


class RT1ActionTokenizer:
  """Tokenizes an action based on an action spec and a vocab size."""

  def __init__(self,
               action_spec: tensorspec_utils.TensorSpecStruct,
               vocab_size: int,
               action_order: Optional[list[str]] = None):
    """Instantiates an RT1ActionTokenizer.

    Args:
      action_spec: Tensor spec of the expected action tensor.
      vocab_size: Number of buckets to discretize the action to.
      action_order: Order of the action names, used to discern the order of
        tokenized actions to detokenize and assemble back to an action tensor.
    """
    self._action_spec = action_spec
    self._vocab_size = vocab_size
    if action_order is None:
      self._action_order = self._action_spec.keys()
    else:
      for action in action_order:
        if action not in self._action_spec.keys():
          raise ValueError('actions: %s not found in action_spec: %s' %
                           (action, action_spec.keys()))
      self._action_order = action_order
    self._tokens_per_action = 0
    for action in self._action_order:
      action_shape = self._action_spec[action].shape
      if len(action_shape) != 1:
        raise ValueError(
            'Only action shapes with a single dimension are supported, '
            'got %s' % action_shape)
      if self._action_spec[action].dtype == tf.int32:
        # Int32 actions are already assumed to be tokens.
        self._tokens_per_action += 1
      else:
        self._tokens_per_action += action_shape[0]
    # We count the action tokens in two different ways: from action_order
    # (above) and by looping through the action spec (below). We assert that
    # the two counts match, which ensures action_order is configured
    # correctly; otherwise the assert below raises an error.
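    # For the docstring example, both counts work out to 8 tokens: 1 for the
    # int32 terminate one-hot, plus 3 + 3 + 1 for the float world_vector,
    # rotation_delta, and gripper_closedness dimensions.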
    num_action_token = 0
    for spec in self._action_spec.values():
      if spec.dtype == tf.int32:
        num_action_token += 1
      else:
        num_action_token += spec.shape[-1]
    tf.debugging.assert_equal(num_action_token, self._tokens_per_action)

  @property
  def tokens_per_action(self) -> int:
    return self._tokens_per_action

  @property
  def action_spec(self) -> tensorspec_utils.TensorSpecStruct:
    return self._action_spec

  @property
  def action_order(self) -> list[str]:
    return self._action_order

  def tokenize(self, action: tensorspec_utils.TensorSpecStruct) -> tf.Tensor:
    """Tokenizes an action."""
    action_tokens = []
    for k in self._action_order:
      a = action[k]  # a is [batch, action_size]
      spec = self._action_spec[k]
      if spec.dtype == tf.int32:
        # Int32 actions are already assumed to be one-hot tokens; check that
        # the input is a valid one-hot and fits within the vocab size.
        tf.debugging.assert_equal(1, tf.reduce_sum(a, axis=-1))
        # Extract the token, [batch]
        token = tf.argmax(a, axis=-1, output_type=tf.int32)
        tf.debugging.assert_less(token, self._vocab_size)
        # Add a sequence dimension, [batch, 1]
        token = tf.expand_dims(token, axis=-1)
      else:
        a = tf.clip_by_value(a, spec.minimum, spec.maximum)
        # Normalize the action to [0, 1], [batch, action_size]
        token = (a - spec.minimum) / (spec.maximum - spec.minimum)
        # Bucket and discretize the action to vocab_size, [batch, action_size]
        token = tf.cast(token * (self._vocab_size - 1), tf.int32)
      action_tokens.append(token)
    # Concatenate all action tokens, [batch, all_action_size]
    action_tokens = tf.concat(action_tokens, axis=-1)
    return action_tokens

  def detokenize(
      self, action_tokens: tf.Tensor) -> tensorspec_utils.TensorSpecStruct:
    """Detokenizes an action."""
    action = tensorspec_utils.TensorSpecStruct()
    token_index = 0
    for k in self._action_order:
      spec = self._action_spec[k]
      action_dim = spec.shape[0]
      if spec.dtype == tf.int32:
        # Int32 actions are already assumed to be tokens.
        action[k] = action_tokens[..., token_index]
        # A poor model may output tokens outside the allowed range; in that
        # case, set them to a default value, the 0 token.
        outside_range = tf.greater_equal(action[k], action_dim)
        action[k] = tf.where(outside_range, tf.zeros_like(action[k]),
                             action[k])
        action[k] = tf.one_hot(
            action[k], depth=action_dim, axis=-1, dtype=tf.int32)
        token_index += 1
      else:
        actions = []
        for _ in range(action_dim):
          a = action_tokens[..., token_index:token_index + 1]
          a = tf.cast(a, tf.float32)
          # Invert the bucketing: map the token back to [minimum, maximum].
          a = a / (self._vocab_size - 1)
          a = (a * (spec.maximum - spec.minimum)) + spec.minimum
          actions.append(a)
          token_index += 1
        action[k] = tf.concat(actions, axis=-1)
    return action
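
# A minimal usage sketch (illustrative, not part of the original module). It
# assumes bounded specs in the style of tf_agents.specs.BoundedTensorSpec,
# which carry the shape/dtype/minimum/maximum attributes the tokenizer reads;
# the spec bounds and vocab size below are arbitrary.
if __name__ == '__main__':
  from tf_agents.specs import tensor_spec

  example_spec = tensorspec_utils.TensorSpecStruct()
  example_spec.world_vector = tensor_spec.BoundedTensorSpec(
      (3,), dtype=tf.float32, minimum=-1.0, maximum=1.0, name='world_vector')
  tokenizer = RT1ActionTokenizer(
      example_spec, vocab_size=256, action_order=['world_vector'])

  example_action = tensorspec_utils.TensorSpecStruct()
  example_action.world_vector = tf.constant([[0.9, 0.8, -0.3]],
                                            dtype=tf.float32)
  tokens = tokenizer.tokenize(example_action)  # [batch, 3] int32 bucket ids.
  recovered = tokenizer.detokenize(tokens)  # Approximate float round trip.
  print(tokens.numpy(), recovered.world_vector.numpy())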