# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for networks."""
import copy
from typing import Optional, Tuple, Union
from absl.testing import parameterized
import numpy as np
from robotics_transformer import sequence_agent
from robotics_transformer import transformer_network
from tensor2robot.utils import tensorspec_utils
import tensorflow as tf
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts
BATCH_SIZE = 2
TIME_SEQUENCE_LENGTH = 3
HEIGHT = 256
WIDTH = 320
NUM_IMAGE_TOKENS = 2


def spec_names_list() -> list[str]:
"""Lists the different types of specs accepted by the transformer."""
return ['default']


def state_spec_list() -> list[tensorspec_utils.TensorSpecStruct]:
"""Lists the different types of state spec accepted by the transformer."""
state_spec = tensorspec_utils.TensorSpecStruct()
state_spec.image = tensor_spec.BoundedTensorSpec([HEIGHT, WIDTH, 3],
dtype=tf.float32,
name='image',
minimum=0.,
maximum=1.)
state_spec.natural_language_embedding = tensor_spec.TensorSpec(
shape=[512], dtype=tf.float32, name='natural_language_embedding')
state_spec_mask = copy.deepcopy(state_spec)
state_spec_mask.initial_binary_mask = tensor_spec.BoundedTensorSpec(
[HEIGHT, WIDTH, 1],
dtype=tf.int32,
name='initial_binary_mask',
minimum=0,
maximum=255)
state_spec_tcl = copy.deepcopy(state_spec)
state_spec_tcl.original_image = tensor_spec.BoundedTensorSpec(
[HEIGHT, WIDTH, 3],
dtype=tf.float32,
name='original_image',
minimum=0.,
maximum=1.)
return [
state_spec,
state_spec_mask,
state_spec_tcl,
]


def observations_list(training: bool = True) -> list[dict[str, tf.Tensor]]:
"""Lists the different types of observations accepted by the transformer."""
if training:
image_shape = [BATCH_SIZE, TIME_SEQUENCE_LENGTH, HEIGHT, WIDTH, 3]
emb_shape = [BATCH_SIZE, TIME_SEQUENCE_LENGTH, 512]
mask_shape = [BATCH_SIZE, TIME_SEQUENCE_LENGTH, HEIGHT, WIDTH, 1]
else:
    # Inference currently only supports a batch size of 1.
image_shape = [1, HEIGHT, WIDTH, 3]
emb_shape = [1, 512]
mask_shape = [1, HEIGHT, WIDTH, 1]
return [
{
'image': tf.constant(0.5, shape=image_shape),
'natural_language_embedding': tf.constant(1., shape=emb_shape),
},
{
'image': tf.constant(0.5, shape=image_shape),
'natural_language_embedding': tf.constant(1., shape=emb_shape),
'initial_binary_mask': tf.constant(192, shape=mask_shape),
},
{ # This is used for TCL.
'image': tf.constant(0.5, shape=image_shape),
'original_image': tf.constant(0.4, shape=image_shape),
'natural_language_embedding': tf.constant(1., shape=emb_shape),
},
]
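

# Note: zip() truncates to the shorter sequence, so these dicts only contain
# entries for the names returned by spec_names_list().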
NAME_TO_STATE_SPECS = dict(zip(spec_names_list(), state_spec_list()))
NAME_TO_OBSERVATIONS = dict(zip(spec_names_list(), observations_list()))
NAME_TO_INF_OBSERVATIONS = dict(
zip(spec_names_list(), observations_list(False)))


class FakeImageTokenizer(tf.keras.layers.Layer):
"""Fake Image Tokenizer for testing Transformer."""
def __init__(self,
encoder: ...,
position_embedding: ...,
embedding_output_dim: int,
patch_size: int,
use_token_learner: bool = False,
num_tokens: int = NUM_IMAGE_TOKENS,
use_initial_binary_mask: bool = False,
**kwargs):
del encoder, position_embedding, patch_size, use_token_learner
super().__init__(**kwargs)
self.tokens_per_context_image = num_tokens
if use_initial_binary_mask:
self.tokens_per_context_image += 1
self.embedding_output_dim = embedding_output_dim
self.use_initial_binary_mask = use_initial_binary_mask
def __call__(self,
image: tf.Tensor,
context: Optional[tf.Tensor] = None,
initial_binary_mask: Optional[tf.Tensor] = None,
training: bool = False) -> tf.Tensor:
if self.use_initial_binary_mask:
assert initial_binary_mask is not None
image_shape = tf.shape(image)
seq_size = image_shape[1]
batch_size = image_shape[0]
all_tokens = []
num_tokens = self.tokens_per_context_image
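    # Fill every token of step t with the first batch element's top-left pixel
    # value, so tests can trace which frame produced a given token.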
for t in range(seq_size):
tokens = tf.ones([batch_size, 1, num_tokens, self.embedding_output_dim
]) * image[0][t][0][0]
all_tokens.append(tokens)
return tf.concat(all_tokens, axis=1)


class TransformerNetworkTestUtils(tf.test.TestCase, parameterized.TestCase):
"""Defines specs, SequenceAgent, and various other testing utilities."""
def _define_specs(self,
train_batch_size=BATCH_SIZE,
inference_batch_size=1,
time_sequence_length=TIME_SEQUENCE_LENGTH,
inference_sequence_length=TIME_SEQUENCE_LENGTH,
token_embedding_size=512,
image_width=WIDTH,
image_height=HEIGHT):
"""Defines specs and observations (both training and inference)."""
self.train_batch_size = train_batch_size
self.inference_batch_size = inference_batch_size
self.time_sequence_length = time_sequence_length
self.inference_sequence_length = inference_sequence_length
self.token_embedding_size = token_embedding_size
action_spec = tensorspec_utils.TensorSpecStruct()
action_spec.world_vector = tensor_spec.BoundedTensorSpec(
(3,), dtype=tf.float32, minimum=-1., maximum=1., name='world_vector')
action_spec.rotation_delta = tensor_spec.BoundedTensorSpec(
(3,),
dtype=tf.float32,
minimum=-np.pi / 2,
maximum=np.pi / 2,
name='rotation_delta')
action_spec.gripper_closedness_action = tensor_spec.BoundedTensorSpec(
(1,),
dtype=tf.float32,
minimum=-1.,
maximum=1.,
name='gripper_closedness_action')
action_spec.terminate_episode = tensor_spec.BoundedTensorSpec(
(2,), dtype=tf.int32, minimum=0, maximum=1, name='terminate_episode')
state_spec = tensorspec_utils.TensorSpecStruct()
state_spec.image = tensor_spec.BoundedTensorSpec(
[image_height, image_width, 3],
dtype=tf.float32,
name='image',
minimum=0.,
maximum=1.)
state_spec.natural_language_embedding = tensor_spec.TensorSpec(
shape=[self.token_embedding_size],
dtype=tf.float32,
name='natural_language_embedding')
self._policy_info_spec = {
'return':
tensor_spec.BoundedTensorSpec((),
dtype=tf.float32,
minimum=0.0,
maximum=1.0,
name='return'),
'discounted_return':
tensor_spec.BoundedTensorSpec((),
dtype=tf.float32,
minimum=0.0,
maximum=1.0,
name='discounted_return'),
}
self._state_spec = state_spec
self._action_spec = action_spec
self._inference_observation = {
'image':
tf.constant(
1,
shape=[self.inference_batch_size, image_height, image_width, 3],
dtype=tf.dtypes.float32),
'natural_language_embedding':
tf.constant(
1.,
shape=[self.inference_batch_size, self.token_embedding_size],
dtype=tf.dtypes.float32),
}
self._train_observation = {
'image':
tf.constant(
0.5,
shape=[
self.train_batch_size, self.time_sequence_length,
image_height, image_width, 3
]),
'natural_language_embedding':
tf.constant(
1.,
shape=[
self.train_batch_size, self.time_sequence_length,
self.token_embedding_size
]),
}
self._inference_action = {
'world_vector':
tf.constant(0.5, shape=[self.inference_batch_size, 3]),
'rotation_delta':
tf.constant(0.5, shape=[self.inference_batch_size, 3]),
'terminate_episode':
tf.constant(
[0, 1] * self.inference_batch_size,
shape=[self.inference_batch_size, 2]),
'gripper_closedness_action':
tf.constant(0.5, shape=[self.inference_batch_size, 1]),
}
self._train_action = {
'world_vector':
tf.constant(
0.5,
shape=[self.train_batch_size, self.time_sequence_length, 3]),
'rotation_delta':
tf.constant(
0.5,
shape=[self.train_batch_size, self.time_sequence_length, 3]),
'terminate_episode':
tf.constant(
[0, 1] * self.train_batch_size * self.time_sequence_length,
shape=[self.train_batch_size, self.time_sequence_length, 2]),
'gripper_closedness_action':
tf.constant(
0.5,
shape=[self.train_batch_size, self.time_sequence_length, 1]),
}
def _create_agent(self, actor_network=None):
"""Creates SequenceAgent using custom actor_network."""
time_step_spec = ts.time_step_spec(observation_spec=self._state_spec)
if actor_network is None:
actor_network = transformer_network.TransformerNetwork
self._agent = sequence_agent.SequenceAgent(
time_step_spec=time_step_spec,
action_spec=self._action_spec,
actor_network=actor_network,
actor_optimizer=tf.keras.optimizers.Adam(),
train_step_counter=tf.compat.v1.train.get_or_create_global_step(),
time_sequence_length=TIME_SEQUENCE_LENGTH)
self._num_action_tokens = (
# pylint:disable=protected-access
self._agent._actor_network._action_tokenizer._tokens_per_action)
# pylint:enable=protected-access
def setUp(self):
self._define_specs()
super().setUp()
def get_image_value(self, step_idx: int) -> float:
return float(step_idx) / self.time_sequence_length
def get_action_logits(self, batch_size: int, value: int,
vocab_size: int) -> tf.Tensor:
return tf.broadcast_to(
tf.one_hot(value % vocab_size, vocab_size)[tf.newaxis, tf.newaxis, :],
[batch_size, 1, vocab_size])
def create_obs(self, value) -> dict[str, tf.Tensor]:
observations = {}
observations['image'] = value * self._inference_observation['image']
observations[
'natural_language_embedding'] = value * self._inference_observation[
'natural_language_embedding']
return observations
def fake_action_token_emb(self, action_tokens) -> tf.Tensor:
"""Just pad with zeros."""
shape = action_tokens.shape
assert self.vocab_size > self.token_embedding_size
assert len(shape) == 4
return action_tokens[:, :, :, :self.token_embedding_size]
def fake_transformer(
self, all_tokens, training,
attention_mask) -> Union[tf.Tensor, Tuple[tf.Tensor, list[tf.Tensor]]]:
"""Fakes the call to TransformerNetwork._transformer."""
del training
del attention_mask
# We expect ST00 ST01 A00 A01...
# Where:
# * ST01 is token 1 of state 0.
# * A01 is token 1 of action 0.
shape = all_tokens.shape.as_list()
batch_size = shape[0]
self.assertEqual(batch_size, 1)
emb_size = self.token_embedding_size
# transform to [batch_size, num_tokens, token_size]
all_tokens = tf.reshape(all_tokens, [batch_size, -1, emb_size])
# Pads tokens to be of vocab_size.
self.assertGreater(self.vocab_size, self.token_embedding_size)
all_shape = all_tokens.shape
self.assertLen(all_shape.as_list(), 3)
output_tokens = tf.concat([
all_tokens,
tf.zeros([
all_shape[0], all_shape[1],
self.vocab_size - self.token_embedding_size
])
],
axis=-1)
num_tokens_per_step = NUM_IMAGE_TOKENS + self._num_action_tokens
# Check state/action alignment.
window_range = min(self._step_idx + 1, self.time_sequence_length)
for j in range(window_range):
      # Absolute index of the step stored at window position j = 0.
first_step_idx = max(0, self._step_idx + 1 - self.time_sequence_length)
image_idx = j * num_tokens_per_step
action_start_index = image_idx + NUM_IMAGE_TOKENS
for t in range(NUM_IMAGE_TOKENS):
self.assertAllEqual(
self.get_image_value(first_step_idx + j) *
tf.ones_like(all_tokens[0][image_idx][:self.token_embedding_size]),
all_tokens[0][image_idx + t][:self.token_embedding_size])
      # If j is not the current step in the window, all action dimensions from
      # previous steps have already been inferred and can be checked.
action_dims_range = self.action_inf_idx if j == window_range - 1 else self._num_action_tokens
for t in range(action_dims_range):
token_idx = action_start_index + t
action_value = (first_step_idx + j) * self._num_action_tokens + t
self.assertAllEqual(
self.get_action_logits(
batch_size=batch_size,
value=action_value,
vocab_size=self.vocab_size)[0][0][:self.token_embedding_size],
all_tokens[0][token_idx][:self.token_embedding_size])
# Output the right action dimension value.
image_token_index = (
min(self._step_idx, self.time_sequence_length - 1) *
num_tokens_per_step)
transformer_shift = -1
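    # The shift accounts for the autoregressive decoding convention assumed
    # here: the logits for a token are read from the preceding output position.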
action_index = (
image_token_index + NUM_IMAGE_TOKENS + self.action_inf_idx +
transformer_shift)
action_value = self._step_idx * self._num_action_tokens + self.action_inf_idx
action_logits = self.get_action_logits(
batch_size=batch_size, value=action_value, vocab_size=self.vocab_size)
output_tokens = tf.concat([
output_tokens[:, :action_index, :], action_logits[:, :, :],
output_tokens[:, action_index + 1:, :]
],
axis=1)
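    # Advance to the action dimension that will be inferred on the next call.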
self.action_inf_idx = (self.action_inf_idx + 1) % self._num_action_tokens
attention_scores = []
return output_tokens, attention_scores