# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Sequence policy and agent that directly output actions via an actor network.

These classes are not intended to change, as they are generic enough for any
all-neural, actor-based agent+policy. All new features are intended to be
implemented in `actor_network` and `loss_fn`.
"""
from typing import Optional, Type

from absl import logging
import tensorflow as tf
from tf_agents.agents import data_converter
from tf_agents.agents import tf_agent
from tf_agents.networks import network
from tf_agents.policies import actor_policy
from tf_agents.trajectories import policy_step
from tf_agents.trajectories import time_step as ts
from tf_agents.typing import types
from tf_agents.utils import nest_utils


class SequencePolicy(actor_policy.ActorPolicy):
  """A policy that directly outputs actions via an actor network."""

  def __init__(self, **kwargs):
    self._actions = None
    super().__init__(**kwargs)

  def set_actions(self, actions):
    self._actor_network.set_actions(actions)

  def get_actor_loss(self):
    return self._actor_network.get_actor_loss()

  def get_aux_info(self):
    return self._actor_network.get_aux_info()

  def set_training(self, training):
    self._training = training

  def _action(self,
              time_step: ts.TimeStep,
              policy_state: types.NestedTensor,
              seed: Optional[types.Seed] = None) -> policy_step.PolicyStep:
    # The actor network produces actions directly, so no sampling seed is used.
    del seed
    action, policy_state = self._apply_actor_network(
        time_step.observation,
        step_type=time_step.step_type,
        policy_state=policy_state)
    info = ()
    return policy_step.PolicyStep(action, policy_state, info)

  def _distribution(self, time_step, policy_state):
    current_step = super()._distribution(time_step, policy_state)
    return current_step


class SequenceAgent(tf_agent.TFAgent):
  """A sequence agent that directly outputs actions via an actor network."""

  def __init__(self,
               time_step_spec: ts.TimeStep,
               action_spec: types.NestedTensorSpec,
               actor_network: Type[network.Network],
               actor_optimizer: tf.keras.optimizers.Optimizer,
               policy_cls: Type[actor_policy.ActorPolicy] = SequencePolicy,
               time_sequence_length: int = 6,
               debug_summaries: bool = False,
               **kwargs):
    self._info_spec = ()
    self._actor_network = actor_network(  # pytype: disable=missing-parameter  # dynamic-method-lookup
        input_tensor_spec=time_step_spec.observation,
        output_tensor_spec=action_spec,
        policy_info_spec=self._info_spec,
        train_step_counter=kwargs['train_step_counter'],
        time_sequence_length=time_sequence_length)

    self._actor_optimizer = actor_optimizer
    # The train policy is only used for computing the loss and is never
    # exported as a saved_model.
    self._train_policy = policy_cls(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        info_spec=self._info_spec,
        actor_network=self._actor_network,
        training=True)
    # The collect policy runs the same actor network in inference mode.
    collect_policy = policy_cls(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        info_spec=self._info_spec,
        actor_network=self._actor_network,
        training=False)
    super(SequenceAgent, self).__init__(
        time_step_spec,
        action_spec,
        collect_policy,  # We use the collect_policy as the eval policy.
        collect_policy,
        train_sequence_length=time_sequence_length,
        **kwargs)
    self._data_context = data_converter.DataContext(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        info_spec=collect_policy.info_spec,
        use_half_transition=True)
    self.as_transition = data_converter.AsHalfTransition(
        self._data_context, squeeze_time_dim=False)
    self._debug_summaries = debug_summaries

    # Log per-variable parameter counts and the total size of the actor network.
    num_params = 0
    for weight in self._actor_network.trainable_weights:
      weight_params = 1
      for dim in weight.shape:
        weight_params *= dim
      logging.info('%s has %s params.', weight.name, weight_params)
      num_params += weight_params
    logging.info('Actor network has %sM params.',
                 round(num_params / 1000000., 2))

  def _train(self, experience: types.NestedTensor,
             weights: types.Tensor) -> tf_agent.LossInfo:
    self.train_step_counter.assign_add(1)
    loss_info = self._loss(experience, weights, training=True)
    self._apply_gradients(loss_info.loss)
    return loss_info

  def _apply_gradients(self, loss: types.Tensor):
    variables = self._actor_network.trainable_weights
    gradients = tf.gradients(loss, variables)
    # Zero out NaN and Inf entries in the gradients before applying them.
    new_gradients = []
    for g in gradients:
      if g is not None:
        new_g = tf.where(
            tf.math.logical_or(tf.math.is_inf(g), tf.math.is_nan(g)),
            tf.zeros_like(g), g)
        new_gradients.append(new_g)
      else:
        new_gradients.append(g)
    grads_and_vars = list(zip(new_gradients, variables))
    self._actor_optimizer.apply_gradients(grads_and_vars)
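
  # The loss below is computed by running the train policy over the experience
  # sequence: the ground-truth actions are handed to the actor network via
  # `set_actions`, the network is unrolled by `policy.action(...)`, and the
  # per-step loss it reports through `get_actor_loss` is masked so that final
  # time steps (`is_last`) do not contribute, then averaged.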
  def _loss(self, experience: types.NestedTensor, weights: types.Tensor,
            training: bool) -> tf_agent.LossInfo:
    transition = self.as_transition(experience)
    time_steps, policy_steps, _ = transition
    batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
    policy = self._train_policy
    policy.set_actions(policy_steps.action)
    policy.set_training(training=training)
    with tf.name_scope('actor_loss'):
      policy_state = policy.get_initial_state(batch_size)
      policy.action(time_steps, policy_state=policy_state)
      valid_mask = tf.cast(~time_steps.is_last(), tf.float32)
      loss = valid_mask * policy.get_actor_loss()
      loss = tf.reduce_mean(loss)
      policy.set_actions(None)
      self._actor_network.add_summaries(time_steps.observation,
                                        policy.get_aux_info(),
                                        self._debug_summaries, training)
      return tf_agent.LossInfo(loss=loss, extra=loss)
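

# Example usage (a hypothetical sketch: `MyActorNetwork`, `time_step_spec`,
# `action_spec` and `experience` below are placeholders, not part of this
# module). The actor network class only needs to accept the constructor
# arguments used in `SequenceAgent.__init__` above and to expose
# `set_actions`, `get_actor_loss`, `get_aux_info` and `add_summaries`.
#
#   agent = SequenceAgent(
#       time_step_spec=time_step_spec,
#       action_spec=action_spec,
#       actor_network=MyActorNetwork,
#       actor_optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
#       time_sequence_length=6,
#       train_step_counter=tf.Variable(0, dtype=tf.int64))
#   agent.initialize()
#   loss_info = agent.train(experience)  # Batched, padded trajectory sequences.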