# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Encoder based on Efficientnet."""
from typing import Optional
import gin
from robotics_transformer.film_efficientnet import film_conditioning_layer
from robotics_transformer.film_efficientnet import film_efficientnet_encoder
import tensorflow as tf

_MODELS = {
    'b3': film_efficientnet_encoder.EfficientNetB3,
}
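
# Input resolution (in pixels) that images are resized to for each variant.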
_SIZES = {
    'b3': 300,
}


@gin.configurable
class EfficientNetEncoder(tf.keras.layers.Layer):
"""Applies a pretrained Efficientnet based encoder."""
def __init__(self,
model_variant: str = 'b3',
freeze: bool = False,
early_film: bool = True,
weights: Optional[str] = 'imagenet',
include_top: bool = False,
pooling: bool = True,
**kwargs):
"""Initialize the model.
Args:
model_variant: One of 'b0-b7' of the efficient encoders. See
https://arxiv.org/abs/1905.11946 to understand the variants.
freeze: Whether or not to freeze the pretrained weights (seems to not work
well).
early_film: Whether to inject film layers into the efficientnet encoder
(seems to be essential to getting strong performance).
weights: Which pretrained weights to use. Either 'imagenet', a path to the
pretrained weights, or None for from scratch.
include_top: Whether to add the top fully connected layer. If True, this
will cause encoding to fail and is used only for unit testing purposes.
pooling: If false, returns feature map before global average pooling
**kwargs: Keras specific layer kwargs.
"""
    super(EfficientNetEncoder, self).__init__(**kwargs)
    if model_variant not in _MODELS:
      raise ValueError(f'Unknown variant {model_variant}')
    self.model_variant = model_variant
    self.early_film = early_film
    self.freeze = freeze
    # 1x1 convolution that projects the backbone feature map to 512 channels
    # before FiLM conditioning.
    self.conv1x1 = tf.keras.layers.Conv2D(
        filters=512,
        kernel_size=(1, 1),
        strides=(1, 1),
        padding='SAME',
        use_bias=False,
        kernel_initializer=tf.keras.initializers.VarianceScaling())
    self.net = _MODELS[model_variant](
        include_top=include_top,
        weights=weights,
        include_film=early_film,
    )
    self.film_layer = film_conditioning_layer.FilmConditioning(
        num_channels=512)
    self._pooling = pooling

  def _prepare_image(self, image: tf.Tensor) -> tf.Tensor:
    """Resize the input image and check that the range is correct."""
    if len(image.shape) != 4 or image.shape[-1] != 3:
      raise ValueError('Provided image should have shape (b, h, w, 3).')
    size = _SIZES[self.model_variant]
    if image.shape[1] < size / 4 or image.shape[2] < size / 4:
      raise ValueError('Provided image is too small.')
    if image.shape[1] > size * 4 or image.shape[2] > size * 4:
      raise ValueError('Provided image is too large.')
    image = tf.image.resize(image, (size, size))
    c1 = tf.Assert(tf.reduce_max(image) <= 1, data=[tf.reduce_max(image)])
    c2 = tf.Assert(tf.reduce_min(image) >= 0, data=[tf.reduce_min(image)])
    with tf.control_dependencies([c1, c2]):
      # Inputs are expected in [0, 1]; rescale to [0, 255] before
      # EfficientNet preprocessing.
      image *= 255
      image = film_efficientnet_encoder.preprocess_input(image)
    return image

  def _encode(self, image: tf.Tensor, context: tf.Tensor,
              training: bool) -> tf.Tensor:
    """Run the image through the efficientnet encoder."""
    image = self._prepare_image(image)
    if self.early_film:
      return self.net((image, context), training=training)
    return self.net(image, training=training)

  def call(self,
           image: tf.Tensor,
           context: Optional[tf.Tensor] = None,
           training: bool = True) -> tf.Tensor:
    """Encodes the image, optionally FiLM-conditioned on a context vector."""
    if self.freeze:
      features = tf.stop_gradient(self._encode(image, context, training))
    else:
      features = self._encode(image, context, training)

    if context is not None:
      features = self.conv1x1(features)
      features = self.film_layer(features, context)

    if not self._pooling:
      return features

    # Global average pool.
    return tf.reduce_mean(features, [1, 2])
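

# A minimal usage sketch (not part of the original module): it runs the
# encoder on dummy inputs to illustrate the expected shapes. The batch size,
# the 300x300 input resolution, the 512-dim context vector, and
# `weights=None` (to avoid downloading pretrained ImageNet weights) are
# assumptions made for this example only.
if __name__ == '__main__':
  encoder = EfficientNetEncoder(model_variant='b3', weights=None)
  # Images are expected in [0, 1] with shape (batch, height, width, 3).
  images = tf.random.uniform((2, 300, 300, 3), minval=0.0, maxval=1.0)
  # Example context (e.g. a language embedding) used for FiLM conditioning.
  context = tf.random.normal((2, 512))
  tokens = encoder(images, context=context, training=False)
  print(tokens.shape)  # (2, 512): 512 channels after global average pooling.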