robotics_transformer/tokenizers/action_tokenizer.py

# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A simple action tokenizer used with Robotics Transformer 1.
As an example, if an action is:
terminate = [0, 1]
world_vector = [0.9, 0.8, -0.3]
rotation_delta = [-0.1, 0.2, .6]
gripper_closedness = 0.9
Then we build a sequence of tokens of length 8 [one for each dimension].
The int32 type action dimensions are already assumed discrete and tokenized,
the float dimensions are bucketed according to the specs min and max. Each
dimension has 'vocab_size' buckets.
Currently, this tokenizer assumes one action spec and it is highly recommended
to specify the 'action_order', eg [terminate, world_vector, rotation_delta,
gripper_closedness]. Since after tokenization you lose that information, this
will be useful for debugging. Actions may also be subselected for prediction,
since not all actions are needed in the action_order.
"""
from typing import Optional

from tensor2robot.utils import tensorspec_utils
import tensorflow as tf


class RT1ActionTokenizer:
  """Tokenizes based on vocab size."""

  def __init__(self,
               action_spec: tensorspec_utils.TensorSpecStruct,
               vocab_size: int,
               action_order: Optional[list[str]] = None):
    """Instantiates an RT1ActionTokenizer.

    Args:
      action_spec: Tensor spec of the expected action tensor.
      vocab_size: Number of buckets to discretize the action into.
      action_order: Order of the action names, used to discern the order of
        tokenized actions to detokenize and assemble back into an action
        tensor.
    """
    self._action_spec = action_spec
    self._vocab_size = vocab_size
    if action_order is None:
      self._action_order = self._action_spec.keys()
    else:
      for action in action_order:
        if action not in self._action_spec.keys():
          raise ValueError('actions: %s not found in action_spec: %s' %
                           (action, action_spec.keys()))
        assert action in self._action_spec.keys()
      self._action_order = action_order
    self._tokens_per_action = 0
    for action in self._action_order:
      action_shape = self._action_spec[action].shape
      if len(action_shape) != 1:
        raise ValueError(
            'Only action shapes with single dimension supported, got %s' %
            action_shape)
      if self._action_spec[action].dtype == tf.int32:
        # Int32 actions are already assumed to be tokens.
        self._tokens_per_action += 1
      else:
        self._tokens_per_action += action_shape[0]

    # We count the action tokens in two different ways: by looping over
    # action_order (above) and by looping over the action spec (below), and
    # assert that both counts match. This ensures action_order is configured
    # correctly; otherwise the assert below raises an error.
    num_action_token = 0
    for spec in self._action_spec.values():
      if spec.dtype == tf.int32:
        num_action_token += 1
      else:
        num_action_token += spec.shape[-1]
    tf.debugging.assert_equal(num_action_token, self._tokens_per_action)

  @property
  def tokens_per_action(self) -> int:
    return self._tokens_per_action

  @property
  def action_spec(self) -> tensorspec_utils.TensorSpecStruct:
    return self._action_spec

  @property
  def action_order(self) -> list[str]:
    return self._action_order

  def tokenize(self, action: tensorspec_utils.TensorSpecStruct) -> tf.Tensor:
    """Tokenizes an action."""
    action_tokens = []
    for k in self._action_order:
      a = action[k]  # a is [batch, action_size]
      spec = self._action_spec[k]
      if spec.dtype == tf.int32:
        # Int32 actions are already assumed to be tokens: the action is a
        # one-hot vector, and its argmax is the token, which must be smaller
        # than the vocab size.
        tf.debugging.assert_equal(1, tf.reduce_sum(a, axis=-1))
        # Extract the token, [batch]
        token = tf.argmax(a, axis=-1, output_type=tf.int32)
        tf.debugging.assert_less(token, self._vocab_size)
        # Add a seq dimension, [batch, 1]
        token = tf.expand_dims(token, axis=-1)
      else:
        a = tf.clip_by_value(a, spec.minimum, spec.maximum)
        # Normalize the action to [0, 1], [batch, action_size]
        token = (a - spec.minimum) / (spec.maximum - spec.minimum)
        # Bucket and discretize the action to vocab_size, [batch, action_size]
        token = tf.cast(token * (self._vocab_size - 1), tf.int32)
      action_tokens.append(token)
    # Concatenate all action tokens, [batch, tokens_per_action]
    action_tokens = tf.concat(action_tokens, axis=-1)
    return action_tokens
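
  # Illustrative sketch of the float bucketing above, assuming vocab_size=256
  # and a float spec bounded in [-1.0, 1.0] (assumed bounds, not taken from
  # any real spec): `tokenize` maps a = 0.9 to int((0.9 + 1.0) / 2.0 * 255)
  # = 242, and `detokenize` below maps token 242 back to
  # 242 / 255 * 2.0 - 1.0, i.e. about 0.898, so a quantization error of up to
  # one bucket is expected on the round trip.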

  def detokenize(self,
                 action_tokens: tf.Tensor) -> tensorspec_utils.TensorSpecStruct:
    """Detokenizes an action."""
    action = tensorspec_utils.TensorSpecStruct()
    token_index = 0
    for k in self._action_order:
      spec = self._action_spec[k]
      action_dim = spec.shape[0]
      if spec.dtype == tf.int32:
        # Int32 actions are already assumed to be tokens.
        action[k] = action_tokens[..., token_index]
        # A poor model may output tokens outside the allowed range; in that
        # case, fall back to a default value (the 0 token).
        outside_range = tf.greater_equal(action[k], action_dim)
        action[k] = tf.where(outside_range, tf.zeros_like(action[k]), action[k])
        action[k] = tf.one_hot(
            action[k], depth=action_dim, axis=-1, dtype=tf.int32)
        token_index += 1
      else:
        actions = []
        for _ in range(action_dim):
          a = action_tokens[..., token_index:token_index + 1]
          a = tf.cast(a, tf.float32)
          a = a / (self._vocab_size - 1)
          a = (a * (spec.maximum - spec.minimum)) + spec.minimum
          actions.append(a)
          token_index += 1
        action[k] = tf.concat(actions, axis=-1)
    return action
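

# Minimal usage sketch (assumptions: ExtendedTensorSpec accepts `minimum` and
# `maximum` bounds, which tokenize/detokenize above rely on, and a plain dict
# can be passed for the action values; neither is guaranteed by this module
# itself).
if __name__ == '__main__':
  action_spec = tensorspec_utils.TensorSpecStruct()
  action_spec.terminate_episode = tensorspec_utils.ExtendedTensorSpec(
      shape=(2,), dtype=tf.int32, minimum=0, maximum=1,
      name='terminate_episode')
  action_spec.world_vector = tensorspec_utils.ExtendedTensorSpec(
      shape=(3,), dtype=tf.float32, minimum=-1.0, maximum=1.0,
      name='world_vector')
  tokenizer = RT1ActionTokenizer(
      action_spec,
      vocab_size=256,
      action_order=['terminate_episode', 'world_vector'])
  # A one-hot int32 action plus a 3-d float action, batch size 1.
  action = {
      'terminate_episode': tf.constant([[0, 1]], dtype=tf.int32),
      'world_vector': tf.constant([[0.9, 0.8, -0.3]], dtype=tf.float32),
  }
  tokens = tokenizer.tokenize(action)       # shape [1, 4]
  recovered = tokenizer.detokenize(tokens)  # close to `action` up to bucketing
  print(tokenizer.tokens_per_action, tokens, recovered)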