Source code for keras_aug.layers.augmentation.geometry.random_zoom_and_crop

import tensorflow as tf
from keras_cv import bounding_box
from keras_cv.utils import preprocessing as preprocessing_utils
from tensorflow import keras

from keras_aug.layers.base.vectorized_base_random_layer import (
    VectorizedBaseRandomLayer,
)
from keras_aug.utils import augmentation as augmentation_utils
from keras_aug.utils import bounding_box as bounding_box_utils


[docs]@keras.utils.register_keras_serializable(package="keras_aug")
class RandomZoomAndCrop(VectorizedBaseRandomLayer):
    """RandomZoomAndCrop implements resize with scale distortion.

    RandomZoomAndCrop takes a three-step approach to size-distortion based image
    augmentation. This technique is specifically tuned for object detection
    pipelines. The layer takes an input of images and bounding boxes, both of
    which may be ragged. It outputs a dense image tensor, ready to feed to a
    model for training. As such this layer will commonly be the final step in an
    augmentation pipeline.

    The augmentation process is as follows:
    The image is first scaled according to a randomly sampled scale factor. The
    width and height of the image are then resized according to the sampled
    scale. This is done to introduce noise into the local scale of features in
    the image. A subset of the image is then cropped randomly according to
    ``(crop_height, crop_width)``. This crop is then padded to be
    ``(height, width)``. Bounding boxes are translated and scaled according to
    the random scaling and random cropping.

    Args:
        height (int): The height of result image.
        width (int): The width of result image.
        scale_factor (float|Sequence[float]|keras_aug.FactorSampler): The range
            of the scale factor that is used to scale the input image. When
            represented as a single float, the factor will be picked between
            ``[1.0 - lower, 1.0 + upper]``. To reproduce the results of the
            MaskRCNN paper pass ``(0.8, 1.25)``.
        crop_height (int, optional): The height of the image to crop from the
            scaled image. Defaults to ``height`` when not provided.
        crop_width (int, optional): The width of the image to crop from the
            scaled image. Defaults to ``width`` when not provided.
        interpolation (str, optional): The interpolation method. Supported values:
            ``"nearest", "bilinear", "bicubic", "area", "lanczos3", "lanczos5", "gaussian", "mitchellcubic"``.
            Defaults to ``"bilinear"``.
        antialias (bool, optional): Whether to use antialias. Defaults to
            ``False``.
        position (str, optional): The padding method.
            Supported values: ``"center", "top_left", "top_right", "bottom_left", "bottom_right", "random"``.
            Defaults to ``"center"``.
        padding_value (int|float, optional): The padding value.
            Defaults to ``0``.
        bounding_box_format (str, optional): The format of bounding
            boxes of input dataset. Refer
            https://github.com/keras-team/keras-cv/blob/master/keras_cv/bounding_box/converters.py
            for more details on supported bounding box formats.
        seed (int|float, optional): The random seed. Defaults to ``None``.

    References:
        - `KerasCV <https://github.com/keras-team/keras-cv>`_
    """  # noqa: E501

    def __init__(
        self,
        height,
        width,
        scale_factor,
        crop_height=None,
        crop_width=None,
        interpolation="bilinear",
        antialias=False,
        position="center",
        padding_value=0,
        bounding_box_format=None,
        seed=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        if not isinstance(height, int) or not isinstance(width, int):
            raise ValueError(
                "RandomZoomAndCrop() expects ``height`` and ``width`` to be "
                f"integers. Received ``height={height}, width={width}``"
            )
        self.height = height
        self.width = width
        self.crop_height = crop_height or height
        self.crop_width = crop_width or width
        self.scale_factor = augmentation_utils.parse_factor(
            scale_factor,
            min_value=0.0,
            max_value=None,
            center_value=1.0,
            seed=seed,
        )
        self.interpolation = preprocessing_utils.get_interpolation(
            interpolation
        )
        self.antialias = antialias
        self.position = augmentation_utils.get_padding_position(position)
        self.padding_value = padding_value
        self.bounding_box_format = bounding_box_format
        self.seed = seed

        crop_size = tf.expand_dims(
            tf.stack([self.crop_height, self.crop_width]), axis=0
        )
        self.crop_size = tf.cast(crop_size, dtype=tf.float32)
        self.force_output_dense_images = True

    def compute_ragged_image_signature(self, images):
        ragged_spec = tf.RaggedTensorSpec(
            shape=(self.height, self.width, images.shape[-1]),
            ragged_rank=1,
            dtype=self.compute_dtype,
        )
        return ragged_spec

    def get_random_transformation_batch(
        self, batch_size, images=None, **kwargs
    ):
        # cast to float32 to avoid numerical issue
        heights, widths = augmentation_utils.get_images_shape(
            images, dtype=tf.float32
        )

        # image_scales
        image_shapes = tf.concat((heights, widths), axis=-1)
        scaled_sizes = tf.round(
            image_shapes
            * self.scale_factor(shape=(batch_size, 1), dtype=tf.float32)
        )
        scales = tf.where(
            tf.less(
                scaled_sizes[..., 0:1] / image_shapes[..., 0:1],
                scaled_sizes[..., 1:] / image_shapes[..., 1:],
            ),
            scaled_sizes[..., 0:1] / image_shapes[..., 0:1],
            scaled_sizes[..., 1:] / image_shapes[..., 1:],
        )
        scaled_sizes = tf.round(image_shapes * scales)
        image_scales = scaled_sizes / image_shapes

        # offsets
        max_offsets = scaled_sizes - self.crop_size
        max_offsets = tf.where(
            tf.less(max_offsets, 0), tf.zeros_like(max_offsets), max_offsets
        )
        offsets = max_offsets * self._random_generator.random_uniform(
            shape=(batch_size, 2), minval=0, maxval=1, dtype=tf.float32
        )
        offsets = tf.cast(offsets, dtype=tf.int32)

        # paddings
        new_heights = tf.cast(scaled_sizes[..., 0:1], dtype=tf.int32)
        new_widths = tf.cast(scaled_sizes[..., 1:], dtype=tf.int32)
        tops = tf.where(
            new_heights < self.crop_height,
            tf.cast((self.crop_height - new_heights) / 2, tf.int32),
            0,
        )
        bottoms = tf.where(
            new_heights < self.crop_height,
            self.crop_height - new_heights - tops,
            0,
        )
        lefts = tf.where(
            new_widths < self.crop_width,
            tf.cast((self.crop_width - new_widths) / 2, tf.int32),
            0,
        )
        rights = tf.where(
            new_widths < self.crop_width,
            self.crop_width - new_widths - lefts,
            0,
        )
        (tops, bottoms, lefts, rights) = augmentation_utils.get_position_params(
            tops, bottoms, lefts, rights, self.position, self._random_generator
        )
        paddings = tf.concat([tops, bottoms, lefts, rights], axis=-1)

        return {
            "image_scales": image_scales,
            "scaled_sizes": scaled_sizes,
            "offsets": offsets,
            "paddings": paddings,
        }

    def augment_ragged_image(self, image, transformation, **kwargs):
        image = tf.expand_dims(image, axis=0)
        transformation = augmentation_utils.expand_dict_dims(
            transformation, axis=0
        )
        image = self.augment_images(
            images=image, transformations=transformation, **kwargs
        )
        return tf.squeeze(image, axis=0)

    def augment_images(self, images, transformations, **kwargs):
        # tf.image.resize always output tf.float32 unless interpolation==nearest
        inputs_for_resize_and_crop_single_image = {
            "images": tf.cast(images, dtype=tf.float32),
            "scaled_sizes": transformations["scaled_sizes"],
            "offsets": transformations["offsets"],
            "paddings": transformations["paddings"],
        }
        images = tf.map_fn(
            self.resize_and_crop_single_image,
            inputs_for_resize_and_crop_single_image,
            fn_output_signature=tf.float32,
        )
        images = tf.ensure_shape(
            images, shape=(None, self.height, self.width, None)
        )
        return tf.cast(images, self.compute_dtype)

    def augment_labels(self, labels, transformations, **kwargs):
        return labels

    def augment_bounding_boxes(
        self,
        bounding_boxes,
        transformations,
        images=None,
        raw_images=None,
        **kwargs,
    ):
        if self.bounding_box_format is None:
            raise ValueError(
                "`RandomZoomAndCrop()` was called with bounding boxes,"
                "but no `bounding_box_format` was specified in the constructor."
                "Please specify a bounding box format in the constructor. i.e."
                "`RandomZoomAndCrop(..., bounding_box_format='xyxy')`"
            )
        # cast to float32 to avoid numerical issue
        bounding_boxes = bounding_box.to_dense(bounding_boxes)
        bounding_boxes = bounding_box.convert_format(
            bounding_boxes,
            source=self.bounding_box_format,
            target="yxyx",
            images=raw_images,
            dtype=tf.float32,
        )

        image_scales = tf.cast(
            transformations["image_scales"], dtype=tf.float32
        )
        offsets = tf.cast(transformations["offsets"], dtype=tf.float32)
        paddings = tf.cast(transformations["paddings"], dtype=tf.float32)
        padding_offsets = tf.concat(
            [paddings[..., 0:1], paddings[..., 2:3]], axis=-1
        )

        # Adjusts box coordinates based on image_scale and offset.
        bounding_boxes = bounding_boxes.copy()
        yxyx = bounding_boxes["boxes"]
        yxyx *= tf.tile(image_scales, [1, 2])[..., tf.newaxis, :]
        yxyx -= tf.tile(offsets, [1, 2])[..., tf.newaxis, :]
        yxyx += tf.tile(padding_offsets, [1, 2])[..., tf.newaxis, :]

        bounding_boxes["boxes"] = yxyx
        bounding_boxes = bounding_box_utils.clip_to_image(
            bounding_boxes,
            bounding_box_format="yxyx",
            images=images,
        )
        bounding_boxes = bounding_box.convert_format(
            bounding_boxes,
            source="yxyx",
            target=self.bounding_box_format,
            images=images,
            dtype=self.compute_dtype,
        )
        return bounding_boxes

    def compute_ragged_segmentation_mask_signature(self, segmentation_masks):
        return tf.RaggedTensorSpec(
            shape=(self.height, self.width, segmentation_masks.shape[-1]),
            ragged_rank=1,
            dtype=self.compute_dtype,
        )

    def augment_ragged_segmentation_mask(
        self, segmentation_mask, transformation, **kwargs
    ):
        segmentation_mask = tf.expand_dims(segmentation_mask, axis=0)
        transformation = augmentation_utils.expand_dict_dims(
            transformation, axis=0
        )
        segmentation_mask = self.augment_segmentation_masks(
            segmentation_masks=segmentation_mask,
            transformations=transformation,
            **kwargs,
        )
        return tf.squeeze(segmentation_mask, axis=0)

    def augment_segmentation_masks(
        self, segmentation_masks, transformations, **kwargs
    ):
        # unpackage augmentation arguments
        inputs_for_resize_and_crop_single_segmentation_mask = {
            "segmentation_masks": segmentation_masks,
            "scaled_sizes": transformations["scaled_sizes"],
            "offsets": transformations["offsets"],
            "paddings": transformations["paddings"],
        }
        segmentation_masks = tf.map_fn(
            self.resize_and_crop_single_segmentation_mask,
            inputs_for_resize_and_crop_single_segmentation_mask,
            fn_output_signature=segmentation_masks.dtype,
        )
        segmentation_masks = tf.ensure_shape(
            segmentation_masks, shape=(None, self.height, self.width, None)
        )
        return tf.cast(segmentation_masks, self.compute_dtype)

    def resize_and_crop_single_image(self, inputs):
        image = inputs.get("images", None)
        scaled_size = inputs.get("scaled_sizes", None)
        offset = inputs.get("offsets", None)
        padding = inputs.get("paddings", None)

        image = tf.image.resize(
            image,
            tf.cast(scaled_size, tf.int32),
            method=self.interpolation,
            antialias=self.antialias,
        )
        image = image[
            offset[0] : offset[0] + self.crop_height,
            offset[1] : offset[1] + self.crop_width,
            :,
        ]
        paddings = tf.stack(
            (
                tf.stack((padding[0], padding[1])),
                tf.stack((padding[2], padding[3])),
                tf.zeros(shape=(2,), dtype=tf.int32),
            )
        )
        image = tf.pad(
            image, paddings=paddings, constant_values=self.padding_value
        )
        return image

    def resize_and_crop_single_segmentation_mask(self, inputs):
        segmentation_mask = inputs.get("segmentation_masks", None)
        scaled_size = inputs.get("scaled_sizes", None)
        offset = inputs.get("offsets", None)
        padding = inputs.get("paddings", None)

        segmentation_mask = tf.image.resize(
            segmentation_mask, tf.cast(scaled_size, tf.int32), method="nearest"
        )
        segmentation_mask = segmentation_mask[
            offset[0] : offset[0] + self.crop_height,
            offset[1] : offset[1] + self.crop_width,
            :,
        ]
        paddings = tf.stack(
            (
                tf.stack((padding[0], padding[1])),
                tf.stack((padding[2], padding[3])),
                tf.zeros(shape=(2,), dtype=tf.int32),
            )
        )
        segmentation_mask = tf.pad(
            segmentation_mask,
            paddings=paddings,
            constant_values=self.padding_value,
        )
        return segmentation_mask

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "height": self.height,
                "width": self.width,
                "scale_factor": self.scale_factor,
                "crop_height": self.crop_height,
                "crop_width": self.crop_width,
                "interpolation": self.interpolation,
                "bounding_box_format": self.bounding_box_format,
                "seed": self.seed,
            }
        )
        return config